nikolamand-db commented on code in PR #46077:
URL: https://github.com/apache/spark/pull/46077#discussion_r1567057271
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala:
##########
@@ -58,7 +62,11 @@ abstract class StringRegexExpression extends BinaryExpression
} else {
// Let it raise exception if couldn't compile the regex string
try {
- Pattern.compile(escape(str))
+ var patternFlags: Int = 0
+ if
(CollationFactory.fetchCollation(collationId).supportsLowercaseEquality) {
+ patternFlags = Pattern.CASE_INSENSITIVE
Review Comment:
Same comment as for "u" and "i" flags, we need to add
`Pattern.UNICODE_CASE`. Also, consider using `val` instead of `var`.
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala:
##########
@@ -543,17 +552,23 @@ case class RLike(left: Expression, right: Expression)
extends StringRegexExpress
case class StringSplit(str: Expression, regex: Expression, limit: Expression)
extends TernaryExpression with ImplicitCastInputTypes with NullIntolerant {
- override def dataType: DataType = ArrayType(StringType, containsNull = false)
- override def inputTypes: Seq[DataType] = Seq(StringType, StringType,
IntegerType)
+ override def dataType: DataType = ArrayType(str.dataType, containsNull =
false)
+ override def inputTypes: Seq[AbstractDataType] =
+ Seq(StringTypeBinaryLcase, StringTypeAnyCollation, IntegerType)
override def first: Expression = str
override def second: Expression = regex
override def third: Expression = limit
+ final lazy val collationId: Int =
str.dataType.asInstanceOf[StringType].collationId
+
def this(exp: Expression, regex: Expression) = this(exp, regex, Literal(-1))
override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = {
- val strings = string.asInstanceOf[UTF8String].split(
- regex.asInstanceOf[UTF8String], limit.asInstanceOf[Int])
+ var pattern = regex.asInstanceOf[UTF8String]
+ if
(CollationFactory.fetchCollation(collationId).supportsLowercaseEquality) {
+ pattern = CollationSupport.lowercaseRegex(pattern)
+ }
Review Comment:
We can avoid using var here and in other places.
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##########
@@ -143,7 +143,11 @@ public static boolean execICU(final UTF8String l, final
UTF8String r,
* Collation-aware regexp expressions.
*/
- // TODO: Add more collation-aware regexp expressions.
+ private static final UTF8String lowercaseRegexPrefix =
UTF8String.fromString("(?i)");
Review Comment:
We need both "u" and "i" flags, per
[documentation](https://docs.oracle.com/javase/tutorial/essential/regex/pattern.html)
for "u" flag:
> When this flag is specified then case-insensitive matching, when enabled
by the CASE_INSENSITIVE flag, is done in a manner consistent with the Unicode
Standard. By default, case-insensitive matching assumes that only characters in
the US-ASCII charset are being matched.
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala:
##########
@@ -543,17 +552,23 @@ case class RLike(left: Expression, right: Expression)
extends StringRegexExpress
case class StringSplit(str: Expression, regex: Expression, limit: Expression)
extends TernaryExpression with ImplicitCastInputTypes with NullIntolerant {
- override def dataType: DataType = ArrayType(StringType, containsNull = false)
- override def inputTypes: Seq[DataType] = Seq(StringType, StringType,
IntegerType)
+ override def dataType: DataType = ArrayType(str.dataType, containsNull =
false)
+ override def inputTypes: Seq[AbstractDataType] =
+ Seq(StringTypeBinaryLcase, StringTypeAnyCollation, IntegerType)
override def first: Expression = str
override def second: Expression = regex
override def third: Expression = limit
+ final lazy val collationId: Int =
str.dataType.asInstanceOf[StringType].collationId
+
def this(exp: Expression, regex: Expression) = this(exp, regex, Literal(-1))
override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = {
- val strings = string.asInstanceOf[UTF8String].split(
- regex.asInstanceOf[UTF8String], limit.asInstanceOf[Int])
+ var pattern = regex.asInstanceOf[UTF8String]
+ if
(CollationFactory.fetchCollation(collationId).supportsLowercaseEquality) {
+ pattern = CollationSupport.lowercaseRegex(pattern)
+ }
Review Comment:
```suggestion
val pattern = if
(CollationFactory.fetchCollation(collationId).supportsLowercaseEquality) {
CollationSupport.lowercaseRegex(regex.asInstanceOf[UTF8String])
} else {
regex.asInstanceOf[UTF8String]
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]