Re: [PR] [DRAFT][SPARK-47414][SQL] Lowercase collation support for regexp expressions [spark]

via GitHub Tue, 16 Apr 2024 02:39:59 -0700


nikolamand-db commented on code in PR #46077:
URL: https://github.com/apache/spark/pull/46077#discussion_r1567057271



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala:
##########
@@ -58,7 +62,11 @@ abstract class StringRegexExpression extends BinaryExpression
   } else {
     // Let it raise exception if couldn't compile the regex string
     try {
-      Pattern.compile(escape(str))
+      var patternFlags: Int = 0
+      if 
(CollationFactory.fetchCollation(collationId).supportsLowercaseEquality) {
+        patternFlags = Pattern.CASE_INSENSITIVE

Review Comment:
   Same comment as for "u" and "i" flags, we need to add 
`Pattern.UNICODE_CASE`. Also, consider using `val` instead of `var`.



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala:
##########
@@ -543,17 +552,23 @@ case class RLike(left: Expression, right: Expression) 
extends StringRegexExpress
 case class StringSplit(str: Expression, regex: Expression, limit: Expression)
   extends TernaryExpression with ImplicitCastInputTypes with NullIntolerant {
 
-  override def dataType: DataType = ArrayType(StringType, containsNull = false)
-  override def inputTypes: Seq[DataType] = Seq(StringType, StringType, 
IntegerType)
+  override def dataType: DataType = ArrayType(str.dataType, containsNull = 
false)
+  override def inputTypes: Seq[AbstractDataType] =
+    Seq(StringTypeBinaryLcase, StringTypeAnyCollation, IntegerType)
   override def first: Expression = str
   override def second: Expression = regex
   override def third: Expression = limit
 
+  final lazy val collationId: Int = 
str.dataType.asInstanceOf[StringType].collationId
+
   def this(exp: Expression, regex: Expression) = this(exp, regex, Literal(-1))
 
   override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = {
-    val strings = string.asInstanceOf[UTF8String].split(
-      regex.asInstanceOf[UTF8String], limit.asInstanceOf[Int])
+    var pattern = regex.asInstanceOf[UTF8String]
+    if 
(CollationFactory.fetchCollation(collationId).supportsLowercaseEquality) {
+      pattern = CollationSupport.lowercaseRegex(pattern)
+    }

Review Comment:
   We can avoid using var here and in other places.



##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##########
@@ -143,7 +143,11 @@ public static boolean execICU(final UTF8String l, final 
UTF8String r,
    * Collation-aware regexp expressions.
    */
 
-  // TODO: Add more collation-aware regexp expressions.
+  private static final UTF8String lowercaseRegexPrefix = 
UTF8String.fromString("(?i)");

Review Comment:
   We need both "u" and "i" flags, per 
[documentation](https://docs.oracle.com/javase/tutorial/essential/regex/pattern.html)
 for "u" flag:
   > When this flag is specified then case-insensitive matching, when enabled 
by the CASE_INSENSITIVE flag, is done in a manner consistent with the Unicode 
Standard. By default, case-insensitive matching assumes that only characters in 
the US-ASCII charset are being matched.



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala:
##########
@@ -543,17 +552,23 @@ case class RLike(left: Expression, right: Expression) 
extends StringRegexExpress
 case class StringSplit(str: Expression, regex: Expression, limit: Expression)
   extends TernaryExpression with ImplicitCastInputTypes with NullIntolerant {
 
-  override def dataType: DataType = ArrayType(StringType, containsNull = false)
-  override def inputTypes: Seq[DataType] = Seq(StringType, StringType, 
IntegerType)
+  override def dataType: DataType = ArrayType(str.dataType, containsNull = 
false)
+  override def inputTypes: Seq[AbstractDataType] =
+    Seq(StringTypeBinaryLcase, StringTypeAnyCollation, IntegerType)
   override def first: Expression = str
   override def second: Expression = regex
   override def third: Expression = limit
 
+  final lazy val collationId: Int = 
str.dataType.asInstanceOf[StringType].collationId
+
   def this(exp: Expression, regex: Expression) = this(exp, regex, Literal(-1))
 
   override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = {
-    val strings = string.asInstanceOf[UTF8String].split(
-      regex.asInstanceOf[UTF8String], limit.asInstanceOf[Int])
+    var pattern = regex.asInstanceOf[UTF8String]
+    if 
(CollationFactory.fetchCollation(collationId).supportsLowercaseEquality) {
+      pattern = CollationSupport.lowercaseRegex(pattern)
+    }

Review Comment:
   ```suggestion
       val pattern = if 
(CollationFactory.fetchCollation(collationId).supportsLowercaseEquality) {
         CollationSupport.lowercaseRegex(regex.asInstanceOf[UTF8String])
       } else {
         regex.asInstanceOf[UTF8String]
       }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [DRAFT][SPARK-47414][SQL] Lowercase collation support for regexp expressions [spark]

Reply via email to