mkaravel commented on code in PR #46720:
URL: https://github.com/apache/spark/pull/46720#discussion_r1612654646
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -135,22 +135,90 @@ public static UTF8String lowercaseReplace(final
UTF8String src, final UTF8String
return buf.build();
}
+ /**
+ * Convert the input string to uppercase using the ICU root locale rules.
+ *
+ * @param target the input string
+ * @return the uppercase string
+ */
+ public static UTF8String toUpperCase(final UTF8String target) {
+ return UTF8String.fromString(toUpperCase(target.toString()));
+ }
+ public static String toUpperCase(final String target) {
+ return UCharacter.toUpperCase(target);
+ }
+
+ /**
+ * Convert the input string to uppercase using the specified ICU collation
rules.
+ *
+ * @param target the input string
+ * @return the uppercase string
+ */
+ public static UTF8String toUpperCase(final UTF8String target, final int
collationId) {
+ return UTF8String.fromString(toUpperCase(target.toString(), collationId));
+ }
public static String toUpperCase(final String target, final int collationId)
{
ULocale locale = CollationFactory.fetchCollation(collationId)
.collator.getLocale(ULocale.ACTUAL_LOCALE);
return UCharacter.toUpperCase(locale, target);
}
+ /**
+ * Convert the input string to lowercase using the ICU root locale rules.
+ *
+ * @param target the input string
+ * @return the lowercase string
+ */
+ public static UTF8String toLowerCase(final UTF8String target) {
+ return UTF8String.fromString(toLowerCase(target.toString()));
+ }
+ public static String toLowerCase(final String target) {
+ return UCharacter.toLowerCase(target);
+ }
+
+ /**
+ * Convert the input string to lowercase using the specified ICU collation
rules.
+ *
+ * @param target the input string
+ * @return the lowercase string
+ */
+ public static UTF8String toLowerCase(final UTF8String target, final int
collationId) {
+ return UTF8String.fromString(toLowerCase(target.toString(), collationId));
+ }
public static String toLowerCase(final String target, final int collationId)
{
ULocale locale = CollationFactory.fetchCollation(collationId)
.collator.getLocale(ULocale.ACTUAL_LOCALE);
return UCharacter.toLowerCase(locale, target);
}
+ /**
+ * Convert the input string to lowercase using the ICU root locale rules.
+ *
+ * @param target the input string
+ * @return the lowercase string
+ */
+ public static UTF8String toTitleCase(final UTF8String target) {
+ return UTF8String.fromString(toTitleCase(target.toString()));
+ }
+ public static String toTitleCase(final String target) {
+ BreakIterator wordIterator = BreakIterator.getWordInstance();
+ return UCharacter.toTitleCase(target, wordIterator);
+ }
+
+ /**
+ * Convert the input string to lowercase using the specified ICU collation
rules.
+ *
+ * @param target the input string
+ * @return the lowercase string
+ */
+ public static UTF8String toTitleCase(final UTF8String target, final int
collationId) {
+ return UTF8String.fromString(toTitleCase(target.toString(), collationId));
+ }
public static String toTitleCase(final String target, final int collationId)
{
ULocale locale = CollationFactory.fetchCollation(collationId)
.collator.getLocale(ULocale.ACTUAL_LOCALE);
- return UCharacter.toTitleCase(locale, target,
BreakIterator.getWordInstance(locale));
+ BreakIterator wordIterator = BreakIterator.getWordInstance(locale);
+ return UCharacter.toTitleCase(locale, target, wordIterator);
Review Comment:
Why are we making this change?
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -135,22 +135,90 @@ public static UTF8String lowercaseReplace(final
UTF8String src, final UTF8String
return buf.build();
}
+ /**
+ * Convert the input string to uppercase using the ICU root locale rules.
+ *
+ * @param target the input string
+ * @return the uppercase string
+ */
+ public static UTF8String toUpperCase(final UTF8String target) {
+ return UTF8String.fromString(toUpperCase(target.toString()));
+ }
+ public static String toUpperCase(final String target) {
+ return UCharacter.toUpperCase(target);
+ }
+
+ /**
+ * Convert the input string to uppercase using the specified ICU collation
rules.
+ *
+ * @param target the input string
+ * @return the uppercase string
+ */
+ public static UTF8String toUpperCase(final UTF8String target, final int
collationId) {
+ return UTF8String.fromString(toUpperCase(target.toString(), collationId));
+ }
public static String toUpperCase(final String target, final int collationId)
{
ULocale locale = CollationFactory.fetchCollation(collationId)
.collator.getLocale(ULocale.ACTUAL_LOCALE);
return UCharacter.toUpperCase(locale, target);
}
+ /**
+ * Convert the input string to lowercase using the ICU root locale rules.
+ *
+ * @param target the input string
+ * @return the lowercase string
+ */
+ public static UTF8String toLowerCase(final UTF8String target) {
+ return UTF8String.fromString(toLowerCase(target.toString()));
+ }
+ public static String toLowerCase(final String target) {
+ return UCharacter.toLowerCase(target);
+ }
+
+ /**
+ * Convert the input string to lowercase using the specified ICU collation
rules.
+ *
+ * @param target the input string
+ * @return the lowercase string
+ */
+ public static UTF8String toLowerCase(final UTF8String target, final int
collationId) {
+ return UTF8String.fromString(toLowerCase(target.toString(), collationId));
+ }
public static String toLowerCase(final String target, final int collationId)
{
ULocale locale = CollationFactory.fetchCollation(collationId)
.collator.getLocale(ULocale.ACTUAL_LOCALE);
return UCharacter.toLowerCase(locale, target);
}
+ /**
+ * Convert the input string to lowercase using the ICU root locale rules.
Review Comment:
```suggestion
* Convert the input string to titlecase using the ICU root locale rules.
```
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##########
@@ -208,87 +208,99 @@ public static boolean execICU(final UTF8String l, final
UTF8String r,
public static class Upper {
public static UTF8String exec(final UTF8String v, final int collationId) {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality ||
collation.supportsLowercaseEquality) {
+ if (collation.supportsBinaryEquality) {
return execUTF8(v);
- } else {
+ } else if (collation.supportsLowercaseEquality) {
+ return execLowercase(v);
+ } else {
return execICU(v, collationId);
}
}
public static String genCode(final String v, final int collationId) {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.Upper.exec";
- if (collation.supportsBinaryEquality ||
collation.supportsLowercaseEquality) {
+ if (collation.supportsBinaryEquality) {
return String.format(expr + "UTF8(%s)", v);
- } else {
+ } else if (collation.supportsLowercaseEquality) {
+ return String.format(expr + "Lowercase(%s)", v);
+ } else {
return String.format(expr + "ICU(%s, %d)", v, collationId);
}
}
public static UTF8String execUTF8(final UTF8String v) {
return v.toUpperCase();
}
+ public static UTF8String execLowercase(final UTF8String v) {
+ return CollationAwareUTF8String.toUpperCase(v);
+ }
public static UTF8String execICU(final UTF8String v, final int
collationId) {
- return
UTF8String.fromString(CollationAwareUTF8String.toUpperCase(v.toString(),
collationId));
+ return CollationAwareUTF8String.toUpperCase(v, collationId);
}
}
public static class Lower {
public static UTF8String exec(final UTF8String v, final int collationId) {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality ||
collation.supportsLowercaseEquality) {
+ if (collation.supportsBinaryEquality) {
return execUTF8(v);
+ } else if (collation.supportsLowercaseEquality) {
+ return execLowercase(v);
} else {
return execICU(v, collationId);
}
}
public static String genCode(final String v, final int collationId) {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.Lower.exec";
- if (collation.supportsBinaryEquality ||
collation.supportsLowercaseEquality) {
+ if (collation.supportsBinaryEquality) {
return String.format(expr + "UTF8(%s)", v);
- } else {
+ } else if (collation.supportsLowercaseEquality) {
+ return String.format(expr + "Lowercase(%s)", v);
+ } else {
return String.format(expr + "ICU(%s, %d)", v, collationId);
}
}
public static UTF8String execUTF8(final UTF8String v) {
return v.toLowerCase();
}
+ public static UTF8String execLowercase(final UTF8String v) {
+ return CollationAwareUTF8String.toLowerCase(v);
+ }
public static UTF8String execICU(final UTF8String v, final int
collationId) {
- return
UTF8String.fromString(CollationAwareUTF8String.toLowerCase(v.toString(),
collationId));
+ return CollationAwareUTF8String.toLowerCase(v, collationId);
}
}
public static class InitCap {
public static UTF8String exec(final UTF8String v, final int collationId) {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality ||
collation.supportsLowercaseEquality) {
+ if (collation.supportsBinaryEquality) {
return execUTF8(v);
+ } else if (collation.supportsLowercaseEquality) {
+ return execLowercase(v);
} else {
return execICU(v, collationId);
}
}
-
public static String genCode(final String v, final int collationId) {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.InitCap.exec";
- if (collation.supportsBinaryEquality ||
collation.supportsLowercaseEquality) {
+ if (collation.supportsBinaryEquality) {
return String.format(expr + "UTF8(%s)", v);
+ } else if (collation.supportsLowercaseEquality) {
+ return String.format(expr + "Lowercase(%s)", v);
} else {
return String.format(expr + "ICU(%s, %d)", v, collationId);
}
}
-
public static UTF8String execUTF8(final UTF8String v) {
return v.toLowerCase().toTitleCase();
}
-
+ public static UTF8String execLowercase(final UTF8String v) {
+ return CollationAwareUTF8String.toTitleCase(v);
+ }
public static UTF8String execICU(final UTF8String v, final int
collationId) {
- return UTF8String.fromString(
- CollationAwareUTF8String.toTitleCase(
- CollationAwareUTF8String.toLowerCase(
- v.toString(),
- collationId
- ),
- collationId));
+ return CollationAwareUTF8String.toTitleCase(v, collationId);
Review Comment:
As I mentioned in another comment: this is going to title-case letters not
just after the ASCII space, but after a whole lot of other characters as well.
This is something to be discussed.
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##########
@@ -208,87 +208,99 @@ public static boolean execICU(final UTF8String l, final
UTF8String r,
public static class Upper {
public static UTF8String exec(final UTF8String v, final int collationId) {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality ||
collation.supportsLowercaseEquality) {
+ if (collation.supportsBinaryEquality) {
return execUTF8(v);
- } else {
+ } else if (collation.supportsLowercaseEquality) {
+ return execLowercase(v);
+ } else {
return execICU(v, collationId);
}
}
public static String genCode(final String v, final int collationId) {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.Upper.exec";
- if (collation.supportsBinaryEquality ||
collation.supportsLowercaseEquality) {
+ if (collation.supportsBinaryEquality) {
return String.format(expr + "UTF8(%s)", v);
- } else {
+ } else if (collation.supportsLowercaseEquality) {
+ return String.format(expr + "Lowercase(%s)", v);
+ } else {
return String.format(expr + "ICU(%s, %d)", v, collationId);
}
}
public static UTF8String execUTF8(final UTF8String v) {
return v.toUpperCase();
}
+ public static UTF8String execLowercase(final UTF8String v) {
+ return CollationAwareUTF8String.toUpperCase(v);
+ }
public static UTF8String execICU(final UTF8String v, final int
collationId) {
- return
UTF8String.fromString(CollationAwareUTF8String.toUpperCase(v.toString(),
collationId));
+ return CollationAwareUTF8String.toUpperCase(v, collationId);
}
}
public static class Lower {
public static UTF8String exec(final UTF8String v, final int collationId) {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality ||
collation.supportsLowercaseEquality) {
+ if (collation.supportsBinaryEquality) {
return execUTF8(v);
+ } else if (collation.supportsLowercaseEquality) {
+ return execLowercase(v);
} else {
return execICU(v, collationId);
}
}
public static String genCode(final String v, final int collationId) {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.Lower.exec";
- if (collation.supportsBinaryEquality ||
collation.supportsLowercaseEquality) {
+ if (collation.supportsBinaryEquality) {
return String.format(expr + "UTF8(%s)", v);
- } else {
+ } else if (collation.supportsLowercaseEquality) {
+ return String.format(expr + "Lowercase(%s)", v);
+ } else {
return String.format(expr + "ICU(%s, %d)", v, collationId);
}
}
public static UTF8String execUTF8(final UTF8String v) {
return v.toLowerCase();
}
+ public static UTF8String execLowercase(final UTF8String v) {
+ return CollationAwareUTF8String.toLowerCase(v);
+ }
public static UTF8String execICU(final UTF8String v, final int
collationId) {
- return
UTF8String.fromString(CollationAwareUTF8String.toLowerCase(v.toString(),
collationId));
+ return CollationAwareUTF8String.toLowerCase(v, collationId);
}
}
public static class InitCap {
public static UTF8String exec(final UTF8String v, final int collationId) {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality ||
collation.supportsLowercaseEquality) {
+ if (collation.supportsBinaryEquality) {
return execUTF8(v);
+ } else if (collation.supportsLowercaseEquality) {
+ return execLowercase(v);
} else {
return execICU(v, collationId);
}
}
-
public static String genCode(final String v, final int collationId) {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.InitCap.exec";
- if (collation.supportsBinaryEquality ||
collation.supportsLowercaseEquality) {
+ if (collation.supportsBinaryEquality) {
return String.format(expr + "UTF8(%s)", v);
+ } else if (collation.supportsLowercaseEquality) {
+ return String.format(expr + "Lowercase(%s)", v);
} else {
return String.format(expr + "ICU(%s, %d)", v, collationId);
}
}
-
public static UTF8String execUTF8(final UTF8String v) {
return v.toLowerCase().toTitleCase();
}
-
Review Comment:
Are we using this to implement initcap? If so, this is a breaking change (in
many ways).
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -135,22 +135,90 @@ public static UTF8String lowercaseReplace(final
UTF8String src, final UTF8String
return buf.build();
}
+ /**
+ * Convert the input string to uppercase using the ICU root locale rules.
+ *
+ * @param target the input string
+ * @return the uppercase string
+ */
+ public static UTF8String toUpperCase(final UTF8String target) {
+ return UTF8String.fromString(toUpperCase(target.toString()));
+ }
+ public static String toUpperCase(final String target) {
+ return UCharacter.toUpperCase(target);
+ }
+
+ /**
+ * Convert the input string to uppercase using the specified ICU collation
rules.
+ *
+ * @param target the input string
+ * @return the uppercase string
+ */
+ public static UTF8String toUpperCase(final UTF8String target, final int
collationId) {
+ return UTF8String.fromString(toUpperCase(target.toString(), collationId));
+ }
public static String toUpperCase(final String target, final int collationId)
{
ULocale locale = CollationFactory.fetchCollation(collationId)
.collator.getLocale(ULocale.ACTUAL_LOCALE);
return UCharacter.toUpperCase(locale, target);
}
+ /**
+ * Convert the input string to lowercase using the ICU root locale rules.
+ *
+ * @param target the input string
+ * @return the lowercase string
+ */
+ public static UTF8String toLowerCase(final UTF8String target) {
+ return UTF8String.fromString(toLowerCase(target.toString()));
+ }
+ public static String toLowerCase(final String target) {
+ return UCharacter.toLowerCase(target);
+ }
+
+ /**
+ * Convert the input string to lowercase using the specified ICU collation
rules.
+ *
+ * @param target the input string
+ * @return the lowercase string
+ */
+ public static UTF8String toLowerCase(final UTF8String target, final int
collationId) {
+ return UTF8String.fromString(toLowerCase(target.toString(), collationId));
+ }
public static String toLowerCase(final String target, final int collationId)
{
ULocale locale = CollationFactory.fetchCollation(collationId)
.collator.getLocale(ULocale.ACTUAL_LOCALE);
return UCharacter.toLowerCase(locale, target);
}
+ /**
+ * Convert the input string to lowercase using the ICU root locale rules.
+ *
+ * @param target the input string
+ * @return the lowercase string
Review Comment:
```suggestion
* @return the titlecase string
```
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -135,22 +135,90 @@ public static UTF8String lowercaseReplace(final
UTF8String src, final UTF8String
return buf.build();
}
+ /**
+ * Convert the input string to uppercase using the ICU root locale rules.
+ *
+ * @param target the input string
+ * @return the uppercase string
+ */
+ public static UTF8String toUpperCase(final UTF8String target) {
+ return UTF8String.fromString(toUpperCase(target.toString()));
+ }
+ public static String toUpperCase(final String target) {
+ return UCharacter.toUpperCase(target);
+ }
+
+ /**
+ * Convert the input string to uppercase using the specified ICU collation
rules.
+ *
+ * @param target the input string
+ * @return the uppercase string
+ */
+ public static UTF8String toUpperCase(final UTF8String target, final int
collationId) {
+ return UTF8String.fromString(toUpperCase(target.toString(), collationId));
+ }
public static String toUpperCase(final String target, final int collationId)
{
ULocale locale = CollationFactory.fetchCollation(collationId)
.collator.getLocale(ULocale.ACTUAL_LOCALE);
return UCharacter.toUpperCase(locale, target);
}
+ /**
+ * Convert the input string to lowercase using the ICU root locale rules.
+ *
+ * @param target the input string
+ * @return the lowercase string
+ */
+ public static UTF8String toLowerCase(final UTF8String target) {
+ return UTF8String.fromString(toLowerCase(target.toString()));
+ }
+ public static String toLowerCase(final String target) {
+ return UCharacter.toLowerCase(target);
+ }
+
+ /**
+ * Convert the input string to lowercase using the specified ICU collation
rules.
+ *
+ * @param target the input string
+ * @return the lowercase string
+ */
+ public static UTF8String toLowerCase(final UTF8String target, final int
collationId) {
+ return UTF8String.fromString(toLowerCase(target.toString(), collationId));
+ }
public static String toLowerCase(final String target, final int collationId)
{
ULocale locale = CollationFactory.fetchCollation(collationId)
.collator.getLocale(ULocale.ACTUAL_LOCALE);
return UCharacter.toLowerCase(locale, target);
}
+ /**
+ * Convert the input string to lowercase using the ICU root locale rules.
+ *
+ * @param target the input string
+ * @return the lowercase string
+ */
+ public static UTF8String toTitleCase(final UTF8String target) {
+ return UTF8String.fromString(toTitleCase(target.toString()));
+ }
+ public static String toTitleCase(final String target) {
+ BreakIterator wordIterator = BreakIterator.getWordInstance();
+ return UCharacter.toTitleCase(target, wordIterator);
+ }
Review Comment:
I suggest we do this in a separate PR and probably think a bit more about
the intended behavior.
If this is meant to be used for implementing initcap, notice that it will
break the string not only at the ASCII space, but also at a lot of other
characters (and it is context-sensitive).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]