Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-18 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1570050123


##
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##
@@ -261,6 +261,84 @@ public void testEndsWith() throws SparkException {
 assertEndsWith("The i̇o", "İo", "UNICODE_CI", true);
   }
 
+  private void assertStringInstr(String string, String substring, String 
collationName,
+  Integer expected) throws SparkException {
+UTF8String str = UTF8String.fromString(string);
+UTF8String substr = UTF8String.fromString(substring);
+int collationId = CollationFactory.collationNameToId(collationName);
+assertEquals(expected, CollationSupport.StringInstr.exec(str, substr, 
collationId) + 1);
+  }
+
+  @Test
+  public void testStringInstr() throws SparkException {
+assertStringInstr("aaads", "Aa", "UTF8_BINARY", 0);
+assertStringInstr("aaaDs", "de", "UTF8_BINARY", 0);
+assertStringInstr("aaads", "ds", "UTF8_BINARY", 4);
+assertStringInstr("", "", "UTF8_BINARY", 1);
+assertStringInstr("", "", "UTF8_BINARY", 0);
+assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY", 5);
+assertStringInstr("test大千世界X大千世界", "界X", "UTF8_BINARY", 8);
+assertStringInstr("aaads", "Aa", "UTF8_BINARY_LCASE", 1);
+assertStringInstr("aaaDs", "de", "UTF8_BINARY_LCASE", 0);
+assertStringInstr("aaaDs", "ds", "UTF8_BINARY_LCASE", 4);
+assertStringInstr("", "", "UTF8_BINARY_LCASE", 1);
+assertStringInstr("", "", "UTF8_BINARY_LCASE", 0);
+assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY_LCASE", 5);
+assertStringInstr("test大千世界X大千世界", "界x", "UTF8_BINARY_LCASE", 8);
+assertStringInstr("aaads", "Aa", "UNICODE", 0);
+assertStringInstr("aaads", "aa", "UNICODE", 1);
+assertStringInstr("aaads", "de", "UNICODE", 0);
+assertStringInstr("", "", "UNICODE", 1);
+assertStringInstr("", "", "UNICODE", 0);
+assertStringInstr("test大千世界X大千世界", "界x", "UNICODE", 0);
+assertStringInstr("test大千世界X大千世界", "界X", "UNICODE", 8);
+assertStringInstr("aaads", "AD", "UNICODE_CI", 3);
+assertStringInstr("aaads", "dS", "UNICODE_CI", 4);
+assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0);
+assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8);
+  }

Review Comment:
   ```suggestion
   assertStringInstr("abİo12", "i̇o", "UNICODE_CI", 3);
   assertStringInstr("abi̇o12", "İo", "UNICODE_CI", 3);
 }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-18 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1570050746


##
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##
@@ -261,6 +261,84 @@ public void testEndsWith() throws SparkException {
 assertEndsWith("The i̇o", "İo", "UNICODE_CI", true);
   }
 
+  private void assertStringInstr(String string, String substring, String 
collationName,
+  Integer expected) throws SparkException {
+UTF8String str = UTF8String.fromString(string);
+UTF8String substr = UTF8String.fromString(substring);
+int collationId = CollationFactory.collationNameToId(collationName);
+assertEquals(expected, CollationSupport.StringInstr.exec(str, substr, 
collationId) + 1);
+  }
+
+  @Test
+  public void testStringInstr() throws SparkException {
+assertStringInstr("aaads", "Aa", "UTF8_BINARY", 0);
+assertStringInstr("aaaDs", "de", "UTF8_BINARY", 0);
+assertStringInstr("aaads", "ds", "UTF8_BINARY", 4);
+assertStringInstr("", "", "UTF8_BINARY", 1);
+assertStringInstr("", "", "UTF8_BINARY", 0);
+assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY", 5);
+assertStringInstr("test大千世界X大千世界", "界X", "UTF8_BINARY", 8);
+assertStringInstr("aaads", "Aa", "UTF8_BINARY_LCASE", 1);
+assertStringInstr("aaaDs", "de", "UTF8_BINARY_LCASE", 0);
+assertStringInstr("aaaDs", "ds", "UTF8_BINARY_LCASE", 4);
+assertStringInstr("", "", "UTF8_BINARY_LCASE", 1);
+assertStringInstr("", "", "UTF8_BINARY_LCASE", 0);
+assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY_LCASE", 5);
+assertStringInstr("test大千世界X大千世界", "界x", "UTF8_BINARY_LCASE", 8);
+assertStringInstr("aaads", "Aa", "UNICODE", 0);
+assertStringInstr("aaads", "aa", "UNICODE", 1);
+assertStringInstr("aaads", "de", "UNICODE", 0);
+assertStringInstr("", "", "UNICODE", 1);
+assertStringInstr("", "", "UNICODE", 0);
+assertStringInstr("test大千世界X大千世界", "界x", "UNICODE", 0);
+assertStringInstr("test大千世界X大千世界", "界X", "UNICODE", 8);
+assertStringInstr("aaads", "AD", "UNICODE_CI", 3);
+assertStringInstr("aaads", "dS", "UNICODE_CI", 4);
+assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0);
+assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8);
+  }
+
+  private void assertFindInSet(String word, String set, String collationName,
+Integer expected) throws SparkException {
+UTF8String w = UTF8String.fromString(word);
+UTF8String s = UTF8String.fromString(set);
+int collationId = CollationFactory.collationNameToId(collationName);
+assertEquals(expected, CollationSupport.FindInSet.exec(w, s, collationId));
+  }
+
+  @Test
+  public void testFindInSet() throws SparkException {
+assertFindInSet("AB", "abc,b,ab,c,def", "UTF8_BINARY", 0);
+assertFindInSet("abc", "abc,b,ab,c,def", "UTF8_BINARY", 1);
+assertFindInSet("def", "abc,b,ab,c,def", "UTF8_BINARY", 5);
+assertFindInSet("d,ef", "abc,b,ab,c,def", "UTF8_BINARY", 0);
+assertFindInSet("", "abc,b,ab,c,def", "UTF8_BINARY", 0);
+assertFindInSet("a", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 0);
+assertFindInSet("c", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 4);
+assertFindInSet("AB", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 3);
+assertFindInSet("AbC", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 1);
+assertFindInSet("abcd", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 0);
+assertFindInSet("d,ef", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 0);
+assertFindInSet("XX", "xx", "UTF8_BINARY_LCASE", 1);
+assertFindInSet("", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 0);
+assertFindInSet("界x", "test,大千,世,界X,大,千,世界", "UTF8_BINARY_LCASE", 4);
+assertFindInSet("a", "abc,b,ab,c,def", "UNICODE", 0);
+assertFindInSet("ab", "abc,b,ab,c,def", "UNICODE", 3);
+assertFindInSet("Ab", "abc,b,ab,c,def", "UNICODE", 0);
+assertFindInSet("d,ef", "abc,b,ab,c,def", "UNICODE", 0);
+assertFindInSet("xx", "xx", "UNICODE", 1);
+assertFindInSet("界x", "test,大千,世,界X,大,千,世界", "UNICODE", 0);
+assertFindInSet("大", "test,大千,世,界X,大,千,世界", "UNICODE", 5);
+assertFindInSet("a", "abc,b,ab,c,def", "UNICODE_CI", 0);
+assertFindInSet("C", "abc,b,ab,c,def", "UNICODE_CI", 4);
+assertFindInSet("DeF", "abc,b,ab,c,dEf", "UNICODE_CI", 5);
+assertFindInSet("DEFG", "abc,b,ab,c,def", "UNICODE_CI", 0);
+assertFindInSet("XX", "xx", "UNICODE_CI", 1);
+assertFindInSet("界x", "test,大千,世,界X,大,千,世界", "UNICODE_CI", 4);
+assertFindInSet("界x", "test,大千,界Xx,世,界X,大,千,世界", "UNICODE_CI", 5);
+assertFindInSet("大", "test,大千,世,界X,大,千,世界", "UNICODE_CI", 5);
+  }

Review Comment:
   (same new test cases as above)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, 

Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-18 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1570041252


##
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala:
##
@@ -18,9 +18,7 @@
 package org.apache.spark.sql.catalyst.analysis
 
 import javax.annotation.Nullable
-
 import scala.annotation.tailrec
-

Review Comment:
   undo these changes



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


miland-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565868020


##
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##
@@ -17,13 +17,11 @@
 
 package org.apache.spark.sql
 
-import scala.collection.immutable.Seq
-

Review Comment:
   Reverted the change



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565859516


##
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##
@@ -17,13 +17,11 @@
 
 package org.apache.spark.sql
 
-import scala.collection.immutable.Seq
-

Review Comment:
   are you sure that's the reason? If so, go ahead and remove it
   but I do wonder why this only comes up now, it doesn't have to do with your 
changes



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


miland-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565854152


##
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##
@@ -17,13 +17,11 @@
 
 package org.apache.spark.sql
 
-import scala.collection.immutable.Seq
-

Review Comment:
   It will not pass the tests because of unused import



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565850753


##
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##
@@ -17,13 +17,11 @@
 
 package org.apache.spark.sql
 
-import scala.collection.immutable.Seq
-

Review Comment:
   let's avoid unnecessary changes



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


miland-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565815937


##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -138,72 +138,72 @@ public static boolean execICU(final UTF8String l, final 
UTF8String r,
   }
 
   public static class FindInSet {
-public static int exec(final UTF8String l, final UTF8String r, final int 
collationId) {
+public static int exec(final UTF8String word, final UTF8String set, final 
int collationId) {
   CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
   if (collation.supportsBinaryEquality) {
-return execBinary(l, r);
+return execBinary(word, set);
   } else if (collation.supportsLowercaseEquality) {
-return execLowercase(l, r);
+return execLowercase(word, set);
   } else {
-return execICU(l, r, collationId);
+return execICU(word, set, collationId);
   }
 }
-public static String genCode(final String l, final String r, final int 
collationId) {
+public static String genCode(final String word, final String set, final 
int collationId) {
   CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
   String expr = "CollationSupport.FindInSet.exec";
   if (collation.supportsBinaryEquality) {
-return String.format(expr + "Binary(%s, %s)", l, r);
+return String.format(expr + "Binary(%s, %s)", word, set);
   } else if (collation.supportsLowercaseEquality) {
-return String.format(expr + "Lowercase(%s, %s)", l, r);
+return String.format(expr + "Lowercase(%s, %s)", word, set);
   } else {
-return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId);
+return String.format(expr + "ICU(%s, %s, %d)", word, set, collationId);
   }
 }
-public static int execBinary(final UTF8String l, final UTF8String r) {
-  return r.findInSet(l);
+public static int execBinary(final UTF8String word, final UTF8String set) {
+  return set.findInSet(word);
 }
-public static int execLowercase(final UTF8String l, final UTF8String r) {
-  return r.toLowerCase().findInSet(l.toLowerCase());
+public static int execLowercase(final UTF8String word, final UTF8String 
set) {
+  return set.toLowerCase().findInSet(word.toLowerCase());
 }
-public static int execICU(final UTF8String l, final UTF8String r,
+public static int execICU(final UTF8String word, final UTF8String set,
   final int collationId) {
-  return CollationAwareUTF8String.findInSet(l, r, collationId);
+  return CollationAwareUTF8String.findInSet(word, set, collationId);
 }
   }
 
-  public static class IndexOf {
-public static int exec(final UTF8String l, final UTF8String r, final int 
start,
+  public static class StringInstr {
+public static int exec(final UTF8String string, final UTF8String substring,
 final int collationId) {
   CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
   if (collation.supportsBinaryEquality) {
-return execBinary(l, r, start);
+return execBinary(string, substring);
   } else if (collation.supportsLowercaseEquality) {
-return execLowercase(l, r, start);
+return execLowercase(string, substring);
   } else {
-return execICU(l, r, start, collationId);
+return execICU(string, substring, collationId);
   }
 }
-public static String genCode(final String l, final String r, final int 
start,
+public static String genCode(final String string, final String substring, 
final int start,
 final int collationId) {
   CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
-  String expr = "CollationSupport.IndexOf.exec";
+  String expr = "CollationSupport.StringInstr.exec";
   if (collation.supportsBinaryEquality) {
-return String.format(expr + "Binary(%s, %s, %d)", l, r, start);
+return String.format(expr + "Binary(%s, %s, %d)", string, substring, 
start);
   } else if (collation.supportsLowercaseEquality) {
-return String.format(expr + "Lowercase(%s, %s, %d)", l, r, start);
+return String.format(expr + "Lowercase(%s, %s, %d)", string, 
substring, start);
   } else {
-return String.format(expr + "ICU(%s, %s, %d, %d)", l, r, start, 
collationId);
+return String.format(expr + "ICU(%s, %s, %d, %d)", string, substring, 
start, collationId);
   }
 }
-public static int execBinary(final UTF8String l, final UTF8String r, final 
int start) {
-  return l.indexOf(r, start);
+public static int execBinary(final UTF8String string, final UTF8String 
substring) {
+  return string.indexOf(substring, 0);
 }
-public static int execLowercase(final UTF8String l, 

Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565772062


##
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##
@@ -249,6 +249,86 @@ public void testEndsWith() throws SparkException {
 assertEndsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false);
   }
 
+  private void assertStringInstr(String string, String substring, String 
collationName,
+  Integer value) throws SparkException {
+UTF8String str = UTF8String.fromString(string);
+UTF8String substr = UTF8String.fromString(substring);
+int collationId = CollationFactory.collationNameToId(collationName);
+
+assertEquals(CollationSupport.StringInstr.exec(str, substr, collationId) + 
1, value);
+  }
+  
+  @Test
+  public void testStringInstr() throws SparkException {
+assertStringInstr("aaads", "Aa", "UTF8_BINARY", 0);
+assertStringInstr("aaaDs", "de", "UTF8_BINARY", 0);
+assertStringInstr("aaads", "ds", "UTF8_BINARY", 4);
+assertStringInstr("", "", "UTF8_BINARY", 1);
+assertStringInstr("", "", "UTF8_BINARY", 0);
+assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY", 5);
+assertStringInstr("test大千世界X大千世界", "界X", "UTF8_BINARY", 8);
+assertStringInstr("aaads", "Aa", "UTF8_BINARY_LCASE", 1);
+assertStringInstr("aaaDs", "de", "UTF8_BINARY_LCASE", 0);
+assertStringInstr("aaaDs", "ds", "UTF8_BINARY_LCASE", 4);
+assertStringInstr("", "", "UTF8_BINARY_LCASE", 1);
+assertStringInstr("", "", "UTF8_BINARY_LCASE", 0);
+assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY_LCASE", 5);
+assertStringInstr("test大千世界X大千世界", "界x", "UTF8_BINARY_LCASE", 8);
+assertStringInstr("aaads", "Aa", "UNICODE", 0);
+assertStringInstr("aaads", "aa", "UNICODE", 1);
+assertStringInstr("aaads", "de", "UNICODE", 0);
+assertStringInstr("", "", "UNICODE", 1);
+assertStringInstr("", "", "UNICODE", 0);
+assertStringInstr("test大千世界X大千世界", "界x", "UNICODE", 0);
+assertStringInstr("test大千世界X大千世界", "界X", "UNICODE", 8);
+assertStringInstr("aaads", "AD", "UNICODE_CI", 3);
+assertStringInstr("aaads", "dS", "UNICODE_CI", 4);
+assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8);
+  }
+
+  //word: String, set: String, collationId: Integer, expected: Integer
+  private void assertFindInSet(String word, String set, String collationName,
+ Integer value) throws SparkException {
+UTF8String w = UTF8String.fromString(word);
+UTF8String s = UTF8String.fromString(set);
+int collationId = CollationFactory.collationNameToId(collationName);
+
+assertEquals(CollationSupport.FindInSet.exec(w, s, collationId), value);

Review Comment:
   ```suggestion
   assertEquals(expected, CollationSupport.FindInSet.exec(w, s, 
collationId));
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565771738


##
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##
@@ -249,6 +249,86 @@ public void testEndsWith() throws SparkException {
 assertEndsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false);
   }
 
+  private void assertStringInstr(String string, String substring, String 
collationName,
+  Integer value) throws SparkException {
+UTF8String str = UTF8String.fromString(string);
+UTF8String substr = UTF8String.fromString(substring);
+int collationId = CollationFactory.collationNameToId(collationName);
+
+assertEquals(CollationSupport.StringInstr.exec(str, substr, collationId) + 
1, value);

Review Comment:
   ```suggestion
   assertEquals(expected, CollationSupport.StringInstr.exec(str, substr, 
collationId) + 1);
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565770706


##
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##
@@ -249,6 +249,86 @@ public void testEndsWith() throws SparkException {
 assertEndsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false);
   }
 
+  private void assertStringInstr(String string, String substring, String 
collationName,

Review Comment:
   this is important: Vladimir noticed that we've been using assertEquals 
incorrectly in `CollationSupportSuite.java`
   
   please fix this according to his PR changes: 
https://github.com/apache/spark/pull/46058
   
   since all of the asserts pass here, we wouldn't notice this, but the correct 
order is: first **expected** value, and then **received** value



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565766846


##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -138,72 +138,72 @@ public static boolean execICU(final UTF8String l, final 
UTF8String r,
   }
 
   public static class FindInSet {
-public static int exec(final UTF8String l, final UTF8String r, final int 
collationId) {
+public static int exec(final UTF8String word, final UTF8String set, final 
int collationId) {
   CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
   if (collation.supportsBinaryEquality) {
-return execBinary(l, r);
+return execBinary(word, set);
   } else if (collation.supportsLowercaseEquality) {
-return execLowercase(l, r);
+return execLowercase(word, set);
   } else {
-return execICU(l, r, collationId);
+return execICU(word, set, collationId);
   }
 }
-public static String genCode(final String l, final String r, final int 
collationId) {
+public static String genCode(final String word, final String set, final 
int collationId) {
   CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
   String expr = "CollationSupport.FindInSet.exec";
   if (collation.supportsBinaryEquality) {
-return String.format(expr + "Binary(%s, %s)", l, r);
+return String.format(expr + "Binary(%s, %s)", word, set);
   } else if (collation.supportsLowercaseEquality) {
-return String.format(expr + "Lowercase(%s, %s)", l, r);
+return String.format(expr + "Lowercase(%s, %s)", word, set);
   } else {
-return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId);
+return String.format(expr + "ICU(%s, %s, %d)", word, set, collationId);
   }
 }
-public static int execBinary(final UTF8String l, final UTF8String r) {
-  return r.findInSet(l);
+public static int execBinary(final UTF8String word, final UTF8String set) {
+  return set.findInSet(word);
 }
-public static int execLowercase(final UTF8String l, final UTF8String r) {
-  return r.toLowerCase().findInSet(l.toLowerCase());
+public static int execLowercase(final UTF8String word, final UTF8String 
set) {
+  return set.toLowerCase().findInSet(word.toLowerCase());
 }
-public static int execICU(final UTF8String l, final UTF8String r,
+public static int execICU(final UTF8String word, final UTF8String set,
   final int collationId) {
-  return CollationAwareUTF8String.findInSet(l, r, collationId);
+  return CollationAwareUTF8String.findInSet(word, set, collationId);
 }
   }
 
-  public static class IndexOf {
-public static int exec(final UTF8String l, final UTF8String r, final int 
start,
+  public static class StringInstr {
+public static int exec(final UTF8String string, final UTF8String substring,
 final int collationId) {
   CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
   if (collation.supportsBinaryEquality) {
-return execBinary(l, r, start);
+return execBinary(string, substring);
   } else if (collation.supportsLowercaseEquality) {
-return execLowercase(l, r, start);
+return execLowercase(string, substring);
   } else {
-return execICU(l, r, start, collationId);
+return execICU(string, substring, collationId);
   }
 }
-public static String genCode(final String l, final String r, final int 
start,
+public static String genCode(final String string, final String substring, 
final int start,
 final int collationId) {
   CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
-  String expr = "CollationSupport.IndexOf.exec";
+  String expr = "CollationSupport.StringInstr.exec";
   if (collation.supportsBinaryEquality) {
-return String.format(expr + "Binary(%s, %s, %d)", l, r, start);
+return String.format(expr + "Binary(%s, %s, %d)", string, substring, 
start);
   } else if (collation.supportsLowercaseEquality) {
-return String.format(expr + "Lowercase(%s, %s, %d)", l, r, start);
+return String.format(expr + "Lowercase(%s, %s, %d)", string, 
substring, start);
   } else {
-return String.format(expr + "ICU(%s, %s, %d, %d)", l, r, start, 
collationId);
+return String.format(expr + "ICU(%s, %s, %d, %d)", string, substring, 
start, collationId);
   }
 }
-public static int execBinary(final UTF8String l, final UTF8String r, final 
int start) {
-  return l.indexOf(r, start);
+public static int execBinary(final UTF8String string, final UTF8String 
substring) {
+  return string.indexOf(substring, 0);
 }
-public static int execLowercase(final UTF8String l, final 

Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


miland-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565559690


##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, 
final UTF8String pattern
 pos, pos + pattern.numChars()), pattern, collationId).last() == 0;
 }
 
+private static int findInSet(UTF8String match, UTF8String set, int 
collationId) {
+  if (match.contains(UTF8String.fromString(","))) {
+return 0;
+  }
+
+  StringSearch stringSearch = CollationFactory.getStringSearch(set, match, 
collationId);
+
+  String setString = set.toString();

Review Comment:
   Added



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565559007


##
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:
##
@@ -974,15 +974,19 @@ case class StringTranslate(srcExpr: Expression, 
matchingExpr: Expression, replac
 case class FindInSet(left: Expression, right: Expression) extends 
BinaryExpression
 with ImplicitCastInputTypes with NullIntolerant {
 
-  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)
+  final lazy val collationId: Int = 
left.dataType.asInstanceOf[StringType].collationId
+
+  override def inputTypes: Seq[AbstractDataType] =
+Seq(StringTypeAnyCollation, StringTypeAnyCollation)
 
-  override protected def nullSafeEval(word: Any, set: Any): Any =
-set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String])
+  override protected def nullSafeEval(word: Any, set: Any): Any = {
+CollationSupport.FindInSet.
+  exec(word.asInstanceOf[UTF8String], set.asInstanceOf[UTF8String], 
collationId)
+  }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-nullSafeCodeGen(ctx, ev, (word, set) =>
-  s"${ev.value} = $set.findInSet($word);"
-)
+nullSafeCodeGen(ctx, ev, (l, r) =>
+  s"${ev.value} = " + CollationSupport.FindInSet.genCode(l, r, 
collationId) + ";")

Review Comment:
   of course, it's just shorthand (see defineCodeGen impl)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


miland-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r156472


##
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:
##
@@ -974,15 +974,19 @@ case class StringTranslate(srcExpr: Expression, 
matchingExpr: Expression, replac
 case class FindInSet(left: Expression, right: Expression) extends 
BinaryExpression
 with ImplicitCastInputTypes with NullIntolerant {
 
-  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)
+  final lazy val collationId: Int = 
left.dataType.asInstanceOf[StringType].collationId
+
+  override def inputTypes: Seq[AbstractDataType] =
+Seq(StringTypeAnyCollation, StringTypeAnyCollation)
 
-  override protected def nullSafeEval(word: Any, set: Any): Any =
-set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String])
+  override protected def nullSafeEval(word: Any, set: Any): Any = {
+CollationSupport.FindInSet.
+  exec(word.asInstanceOf[UTF8String], set.asInstanceOf[UTF8String], 
collationId)
+  }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-nullSafeCodeGen(ctx, ev, (word, set) =>
-  s"${ev.value} = $set.findInSet($word);"
-)
+nullSafeCodeGen(ctx, ev, (l, r) =>
+  s"${ev.value} = " + CollationSupport.FindInSet.genCode(l, r, 
collationId) + ";")

Review Comment:
   It works, but I am not sure if we should go from `nullSafeCodeGen` to just 
`defineCodeGen`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565531354


##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, 
final UTF8String pattern
 pos, pos + pattern.numChars()), pattern, collationId).last() == 0;
 }
 
+private static int findInSet(UTF8String match, UTF8String set, int 
collationId) {
+  if (match.contains(UTF8String.fromString(","))) {
+return 0;
+  }
+
+  StringSearch stringSearch = CollationFactory.getStringSearch(set, match, 
collationId);
+
+  String setString = set.toString();

Review Comment:
   and if you do that, I'd say that the UTF8String implementation for 
`getStringSearch` can amount to:
   
   ```
 public static StringSearch getStringSearch(
 final UTF8String left,
 final UTF8String right,
 final int collationId) {
   return getStringSearch(left.toString(), right.toString(), collationId);
 }
   ```
   
   this way, we can have a more complete API



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565524362


##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, 
final UTF8String pattern
 pos, pos + pattern.numChars()), pattern, collationId).last() == 0;
 }
 
+private static int findInSet(UTF8String match, UTF8String set, int 
collationId) {
+  if (match.contains(UTF8String.fromString(","))) {
+return 0;
+  }
+
+  StringSearch stringSearch = CollationFactory.getStringSearch(set, match, 
collationId);
+
+  String setString = set.toString();

Review Comment:
   I think `CollationFactory.getStringSearch(set, match, collationId)` already 
does `set.toString()`
   
   there's no need to do that twice, let's add:
   
   ```
 public static StringSearch getStringSearch(
 final String targetString,
 final String pattern,
 final int collationId) {
   CharacterIterator target = new StringCharacterIterator(targetString);
   Collator collator = 
CollationFactory.fetchCollation(collationId).collator;
   return new StringSearch(pattern, target, (RuleBasedCollator) collator);
 }
   ```
   
   to `CollationFactory`
   
   this way, we can get the stringSearch and setString instances here, without 
extra String allocations



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565508312


##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -137,6 +137,76 @@ public static boolean execICU(final UTF8String l, final 
UTF8String r,
 }
   }
 
+  public static class FindInSet {
+public static int exec(final UTF8String l, final UTF8String r, final int 
collationId) {
+  CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
+  if (collation.supportsBinaryEquality) {
+return execBinary(l, r);
+  } else if (collation.supportsLowercaseEquality) {
+return execLowercase(l, r);
+  } else {
+return execICU(l, r, collationId);
+  }
+}
+public static String genCode(final String l, final String r, final int 
collationId) {
+  CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
+  String expr = "CollationSupport.FindInSet.exec";
+  if (collation.supportsBinaryEquality) {
+return String.format(expr + "Binary(%s, %s)", l, r);
+  } else if (collation.supportsLowercaseEquality) {
+return String.format(expr + "Lowercase(%s, %s)", l, r);
+  } else {
+return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId);
+  }
+}
+public static int execBinary(final UTF8String l, final UTF8String r) {
+  return r.findInSet(l);
+}
+public static int execLowercase(final UTF8String l, final UTF8String r) {
+  return r.toLowerCase().findInSet(l.toLowerCase());
+}
+public static int execICU(final UTF8String l, final UTF8String r,
+  final int collationId) {
+  return CollationAwareUTF8String.findInSet(l, r, collationId);
+}
+  }
+
+  public static class IndexOf {

Review Comment:
   `CollationAwareUTF8String.indexOf` should follow the `UTF8String.indexOf` 
naming, so that part is fine for example
   
   but `CollationSupport.StringInstr` should definitely follow 
`stringExpressions.StringInstr` naming



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565521994


##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, 
final UTF8String pattern
 pos, pos + pattern.numChars()), pattern, collationId).last() == 0;
 }
 
+private static int findInSet(UTF8String match, UTF8String set, int 
collationId) {
+  if (match.contains(UTF8String.fromString(","))) {
+return 0;
+  }
+
+  StringSearch stringSearch = CollationFactory.getStringSearch(set, match, 
collationId);
+
+  String setString = set.toString();

Review Comment:
   I think `CollationFactory.getStringSearch(set, match, collationId)` already 
does set.toString()
   
   there's no need to do that twice, let's add:
   
   ```
 public static StringSearch getStringSearch(
 final String targetString,
 final String pattern,
 final int collationId) {
   CharacterIterator target = new StringCharacterIterator(targetString);
   Collator collator = 
CollationFactory.fetchCollation(collationId).collator;
   return new StringSearch(pattern, target, (RuleBasedCollator) collator);
 }
   ```
   
   to `CollationFactory`
   
   this way, we can get the `stringSearch` and `setString` instances here, 
without extra String allocations



##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, 
final UTF8String pattern
 pos, pos + pattern.numChars()), pattern, collationId).last() == 0;
 }
 
+private static int findInSet(UTF8String match, UTF8String set, int 
collationId) {
+  if (match.contains(UTF8String.fromString(","))) {
+return 0;
+  }
+
+  StringSearch stringSearch = CollationFactory.getStringSearch(set, match, 
collationId);
+
+  String setString = set.toString();

Review Comment:
   I think CollationFactory.getStringSearch(set, match, collationId) already 
does set.toString()
   
   there's no need to do that twice, let's add:
   
 public static StringSearch getStringSearch(
 final String targetString,
 final String pattern,
 final int collationId) {
   CharacterIterator target = new StringCharacterIterator(targetString);
   Collator collator = 
CollationFactory.fetchCollation(collationId).collator;
   return new StringSearch(pattern, target, (RuleBasedCollator) collator);
 }
   to CollationFactory
   
   this way, we can get the stringSearch and setString instances here, without 
extra String allocations



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565521994


##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, 
final UTF8String pattern
 pos, pos + pattern.numChars()), pattern, collationId).last() == 0;
 }
 
+private static int findInSet(UTF8String match, UTF8String set, int 
collationId) {
+  if (match.contains(UTF8String.fromString(","))) {
+return 0;
+  }
+
+  StringSearch stringSearch = CollationFactory.getStringSearch(set, match, 
collationId);
+
+  String setString = set.toString();

Review Comment:
   I think `CollationFactory.getStringSearch(set, match, collationId)` already 
does set.toString()
   
   there's no need to do that twice, let's add:
   
   ```
 public static StringSearch getStringSearch(
 final String targetString,
 final String pattern,
 final int collationId) {
   CharacterIterator target = new StringCharacterIterator(targetString);
   Collator collator = 
CollationFactory.fetchCollation(collationId).collator;
   return new StringSearch(pattern, target, (RuleBasedCollator) collator);
 }
   ```
   
   to `CollationFactory`
   
   this way, we can get the `stringSearch` and `setString` instances here, 
without extra String allocations



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565521994


##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, 
final UTF8String pattern
 pos, pos + pattern.numChars()), pattern, collationId).last() == 0;
 }
 
+private static int findInSet(UTF8String match, UTF8String set, int 
collationId) {
+  if (match.contains(UTF8String.fromString(","))) {
+return 0;
+  }
+
+  StringSearch stringSearch = CollationFactory.getStringSearch(set, match, 
collationId);
+
+  String setString = set.toString();

Review Comment:
   I think `CollationFactory.getStringSearch(set, match, collationId)` already 
does set.toString()
   
   there's no need to do that twice, let's add:
   
   `
 public static StringSearch getStringSearch(
 final String target,
 final String patternString,
 final int collationId) {
   CharacterIterator target = new StringCharacterIterator(left.toString());
   Collator collator = 
CollationFactory.fetchCollation(collationId).collator;
   return new StringSearch(pattern, target, (RuleBasedCollator) collator);
 }
 `
   
   to `CollationFactory`
   
   this way, we can get the `stringSearch` and `setString` instances here, 
without extra String allocations



##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, 
final UTF8String pattern
 pos, pos + pattern.numChars()), pattern, collationId).last() == 0;
 }
 
+private static int findInSet(UTF8String match, UTF8String set, int 
collationId) {
+  if (match.contains(UTF8String.fromString(","))) {
+return 0;
+  }
+
+  StringSearch stringSearch = CollationFactory.getStringSearch(set, match, 
collationId);
+
+  String setString = set.toString();

Review Comment:
   I think `CollationFactory.getStringSearch(set, match, collationId)` already 
does set.toString()
   
   there's no need to do that twice, let's add:
   
   ```
 public static StringSearch getStringSearch(
 final String target,
 final String patternString,
 final int collationId) {
   CharacterIterator target = new StringCharacterIterator(left.toString());
   Collator collator = 
CollationFactory.fetchCollation(collationId).collator;
   return new StringSearch(pattern, target, (RuleBasedCollator) collator);
 }
   ```
   
   to `CollationFactory`
   
   this way, we can get the `stringSearch` and `setString` instances here, 
without extra String allocations



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565521994


##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, 
final UTF8String pattern
 pos, pos + pattern.numChars()), pattern, collationId).last() == 0;
 }
 
+private static int findInSet(UTF8String match, UTF8String set, int 
collationId) {
+  if (match.contains(UTF8String.fromString(","))) {
+return 0;
+  }
+
+  StringSearch stringSearch = CollationFactory.getStringSearch(set, match, 
collationId);
+
+  String setString = set.toString();

Review Comment:
   I think `CollationFactory.getStringSearch(set, match, collationId)` already 
does set.toString()
   
   there's no need to do that twice, let's add:
   `  public static StringSearch getStringSearch(
 final String target,
 final String patternString,
 final int collationId) {
   CharacterIterator target = new StringCharacterIterator(left.toString());
   Collator collator = 
CollationFactory.fetchCollation(collationId).collator;
   return new StringSearch(pattern, target, (RuleBasedCollator) collator);
 }`
   to `CollationFactory`
   
   this way, we can get the `stringSearch` and `setString` instances here, 
without extra String allocations



##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, 
final UTF8String pattern
 pos, pos + pattern.numChars()), pattern, collationId).last() == 0;
 }
 
+private static int findInSet(UTF8String match, UTF8String set, int 
collationId) {
+  if (match.contains(UTF8String.fromString(","))) {
+return 0;
+  }
+
+  StringSearch stringSearch = CollationFactory.getStringSearch(set, match, 
collationId);
+
+  String setString = set.toString();

Review Comment:
   I think `CollationFactory.getStringSearch(set, match, collationId)` already 
does set.toString()
   
   there's no need to do that twice, let's add:
   
   `  public static StringSearch getStringSearch(
 final String target,
 final String patternString,
 final int collationId) {
   CharacterIterator target = new StringCharacterIterator(left.toString());
   Collator collator = 
CollationFactory.fetchCollation(collationId).collator;
   return new StringSearch(pattern, target, (RuleBasedCollator) collator);
 }`
   
   to `CollationFactory`
   
   this way, we can get the `stringSearch` and `setString` instances here, 
without extra String allocations



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565515634


##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, 
final UTF8String pattern
 pos, pos + pattern.numChars()), pattern, collationId).last() == 0;
 }
 
+private static int findInSet(UTF8String match, UTF8String set, int 
collationId) {

Review Comment:
   these parameters should be `final`



##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, 
final UTF8String pattern
 pos, pos + pattern.numChars()), pattern, collationId).last() == 0;
 }
 
+private static int findInSet(UTF8String match, UTF8String set, int 
collationId) {
+  if (match.contains(UTF8String.fromString(","))) {
+return 0;
+  }
+
+  StringSearch stringSearch = CollationFactory.getStringSearch(set, match, 
collationId);
+
+  String setString = set.toString();
+  int wordStart = 0;
+  while ((wordStart = stringSearch.next()) != StringSearch.DONE) {
+boolean isValidStart = wordStart == 0 || setString.charAt(wordStart - 
1) == ',';
+boolean isValidEnd = wordStart + stringSearch.getMatchLength() == 
setString.length()
+|| setString.charAt(wordStart + stringSearch.getMatchLength()) 
== ',';
+
+if (isValidStart && isValidEnd) {
+  int pos = 0;
+  for (int i = 0; i < setString.length() && i < wordStart; i++) {
+if (setString.charAt(i) == ',') {
+  pos++;
+}
+  }
+
+  return pos + 1;
+}
+  }
+
+  return 0;
+}
+
+private static int indexOf(UTF8String target, UTF8String pattern, int 
start, int collationId) {

Review Comment:
   these parameters should be `final`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565510703


##
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:
##
@@ -974,15 +974,19 @@ case class StringTranslate(srcExpr: Expression, 
matchingExpr: Expression, replac
 case class FindInSet(left: Expression, right: Expression) extends 
BinaryExpression
 with ImplicitCastInputTypes with NullIntolerant {
 
-  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)
+  final lazy val collationId: Int = 
left.dataType.asInstanceOf[StringType].collationId
+
+  override def inputTypes: Seq[AbstractDataType] =
+Seq(StringTypeAnyCollation, StringTypeAnyCollation)
 
-  override protected def nullSafeEval(word: Any, set: Any): Any =
-set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String])
+  override protected def nullSafeEval(word: Any, set: Any): Any = {
+CollationSupport.FindInSet.
+  exec(word.asInstanceOf[UTF8String], set.asInstanceOf[UTF8String], 
collationId)
+  }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-nullSafeCodeGen(ctx, ev, (word, set) =>
-  s"${ev.value} = $set.findInSet($word);"
-)
+nullSafeCodeGen(ctx, ev, (l, r) =>

Review Comment:
   perhaps we should use "word" & "set" (instead of "l" & "r") to preserve the 
original naming



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565508312


##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -137,6 +137,76 @@ public static boolean execICU(final UTF8String l, final 
UTF8String r,
 }
   }
 
+  public static class FindInSet {
+public static int exec(final UTF8String l, final UTF8String r, final int 
collationId) {
+  CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
+  if (collation.supportsBinaryEquality) {
+return execBinary(l, r);
+  } else if (collation.supportsLowercaseEquality) {
+return execLowercase(l, r);
+  } else {
+return execICU(l, r, collationId);
+  }
+}
+public static String genCode(final String l, final String r, final int 
collationId) {
+  CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
+  String expr = "CollationSupport.FindInSet.exec";
+  if (collation.supportsBinaryEquality) {
+return String.format(expr + "Binary(%s, %s)", l, r);
+  } else if (collation.supportsLowercaseEquality) {
+return String.format(expr + "Lowercase(%s, %s)", l, r);
+  } else {
+return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId);
+  }
+}
+public static int execBinary(final UTF8String l, final UTF8String r) {
+  return r.findInSet(l);
+}
+public static int execLowercase(final UTF8String l, final UTF8String r) {
+  return r.toLowerCase().findInSet(l.toLowerCase());
+}
+public static int execICU(final UTF8String l, final UTF8String r,
+  final int collationId) {
+  return CollationAwareUTF8String.findInSet(l, r, collationId);
+}
+  }
+
+  public static class IndexOf {

Review Comment:
   `CollationAwareUTF8String.indexOf` should follow the `UTF8String.indexOf 
naming`, so that part is fine for example
   
   but `CollationSupport.StringInstr` should definitely follow 
`stringExpressions.StringInstr` naming



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565502312


##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -137,6 +137,76 @@ public static boolean execICU(final UTF8String l, final 
UTF8String r,
 }
   }
 
+  public static class FindInSet {
+public static int exec(final UTF8String l, final UTF8String r, final int 
collationId) {
+  CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
+  if (collation.supportsBinaryEquality) {
+return execBinary(l, r);
+  } else if (collation.supportsLowercaseEquality) {
+return execLowercase(l, r);
+  } else {
+return execICU(l, r, collationId);
+  }
+}
+public static String genCode(final String l, final String r, final int 
collationId) {
+  CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
+  String expr = "CollationSupport.FindInSet.exec";
+  if (collation.supportsBinaryEquality) {
+return String.format(expr + "Binary(%s, %s)", l, r);
+  } else if (collation.supportsLowercaseEquality) {
+return String.format(expr + "Lowercase(%s, %s)", l, r);
+  } else {
+return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId);
+  }
+}
+public static int execBinary(final UTF8String l, final UTF8String r) {
+  return r.findInSet(l);
+}
+public static int execLowercase(final UTF8String l, final UTF8String r) {
+  return r.toLowerCase().findInSet(l.toLowerCase());
+}
+public static int execICU(final UTF8String l, final UTF8String r,
+  final int collationId) {
+  return CollationAwareUTF8String.findInSet(l, r, collationId);
+}
+  }
+
+  public static class IndexOf {

Review Comment:
   let's keep the naming uniform
   
   if the original expression is `case class StringInstr`, then this should 
probably be:
   `public static class IndexOf`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565502312


##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -137,6 +137,76 @@ public static boolean execICU(final UTF8String l, final 
UTF8String r,
 }
   }
 
+  public static class FindInSet {
+public static int exec(final UTF8String l, final UTF8String r, final int 
collationId) {
+  CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
+  if (collation.supportsBinaryEquality) {
+return execBinary(l, r);
+  } else if (collation.supportsLowercaseEquality) {
+return execLowercase(l, r);
+  } else {
+return execICU(l, r, collationId);
+  }
+}
+public static String genCode(final String l, final String r, final int 
collationId) {
+  CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
+  String expr = "CollationSupport.FindInSet.exec";
+  if (collation.supportsBinaryEquality) {
+return String.format(expr + "Binary(%s, %s)", l, r);
+  } else if (collation.supportsLowercaseEquality) {
+return String.format(expr + "Lowercase(%s, %s)", l, r);
+  } else {
+return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId);
+  }
+}
+public static int execBinary(final UTF8String l, final UTF8String r) {
+  return r.findInSet(l);
+}
+public static int execLowercase(final UTF8String l, final UTF8String r) {
+  return r.toLowerCase().findInSet(l.toLowerCase());
+}
+public static int execICU(final UTF8String l, final UTF8String r,
+  final int collationId) {
+  return CollationAwareUTF8String.findInSet(l, r, collationId);
+}
+  }
+
+  public static class IndexOf {

Review Comment:
   let's keep the naming uniform
   
   if the original expression is `case class StringInstr`, then this should 
probably be:
   `public static class IndexOf`



##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##
@@ -137,6 +137,76 @@ public static boolean execICU(final UTF8String l, final 
UTF8String r,
 }
   }
 
+  public static class FindInSet {
+public static int exec(final UTF8String l, final UTF8String r, final int 
collationId) {
+  CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
+  if (collation.supportsBinaryEquality) {
+return execBinary(l, r);
+  } else if (collation.supportsLowercaseEquality) {
+return execLowercase(l, r);
+  } else {
+return execICU(l, r, collationId);
+  }
+}
+public static String genCode(final String l, final String r, final int 
collationId) {
+  CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
+  String expr = "CollationSupport.FindInSet.exec";
+  if (collation.supportsBinaryEquality) {
+return String.format(expr + "Binary(%s, %s)", l, r);
+  } else if (collation.supportsLowercaseEquality) {
+return String.format(expr + "Lowercase(%s, %s)", l, r);
+  } else {
+return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId);
+  }
+}
+public static int execBinary(final UTF8String l, final UTF8String r) {
+  return r.findInSet(l);
+}
+public static int execLowercase(final UTF8String l, final UTF8String r) {
+  return r.toLowerCase().findInSet(l.toLowerCase());
+}
+public static int execICU(final UTF8String l, final UTF8String r,
+  final int collationId) {
+  return CollationAwareUTF8String.findInSet(l, r, collationId);
+}
+  }
+
+  public static class IndexOf {

Review Comment:
   let's keep the naming uniform
   
   if the original expression is `case class StringInstr`, then this should 
probably be:
   `public static class StringInstr`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565494137


##
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##
@@ -96,6 +95,107 @@ class CollationStringExpressionsSuite
 assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT")
   }
 
+  test("INSTR check result on explicitly collated strings") {

Review Comment:
   Let's try to unify test naming
   
   e.g. `test("Support StringInstr string expression with collation")`if the 
test is in CollationStringExpressionsSuite.scala



##
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##
@@ -96,6 +95,107 @@ class CollationStringExpressionsSuite
 assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT")
   }
 
+  test("INSTR check result on explicitly collated strings") {
+def testInStr(str: String, substr: String, collationId: Integer, expected: 
Integer): Unit = {
+  val string = Literal.create(str, StringType(collationId))
+  val substring = Literal.create(substr, StringType(collationId))
+
+  checkEvaluation(StringInstr(string, substring), expected)
+}
+
+var collationId = CollationFactory.collationNameToId("UTF8_BINARY")
+testInStr("aaads", "Aa", collationId, 0)
+testInStr("aaaDs", "de", collationId, 0)
+testInStr("aaads", "ds", collationId, 4)
+testInStr("", "", collationId, 1)
+testInStr("", "", collationId, 0)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "大千", collationId, 5)
+testInStr("test大千世界X大千世界", "界X", collationId, 8)
+// scalastyle:on
+
+collationId = CollationFactory.collationNameToId("UTF8_BINARY_LCASE")
+testInStr("aaads", "Aa", collationId, 1)
+testInStr("aaaDs", "de", collationId, 0)
+testInStr("aaaDs", "ds", collationId, 4)
+testInStr("", "", collationId, 1)
+testInStr("", "", collationId, 0)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "大千", collationId, 5)
+testInStr("test大千世界X大千世界", "界x", collationId, 8)
+// scalastyle:on
+
+collationId = CollationFactory.collationNameToId("UNICODE")
+testInStr("aaads", "Aa", collationId, 0)
+testInStr("aaads", "aa", collationId, 1)
+testInStr("aaads", "de", collationId, 0)
+testInStr("", "", collationId, 1)
+testInStr("", "", collationId, 0)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "界x", collationId, 0)
+testInStr("test大千世界X大千世界", "界X", collationId, 8)
+// scalastyle:on
+
+collationId = CollationFactory.collationNameToId("UNICODE_CI")
+testInStr("aaads", "AD", collationId, 3)
+testInStr("aaads", "dS", collationId, 4)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "界x", collationId, 8)
+// scalastyle:on
+  }
+
+  test("FIND_IN_SET check result on explicitly collated strings") {

Review Comment:
   Let's try to unify test naming
   
   e.g. `test("Support FindInSet string expression with collation")` if the 
test is in CollationStringExpressionsSuite.scala



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565494692


##
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##
@@ -96,6 +95,107 @@ class CollationStringExpressionsSuite
 assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT")
   }
 
+  test("INSTR check result on explicitly collated strings") {
+def testInStr(str: String, substr: String, collationId: Integer, expected: 
Integer): Unit = {
+  val string = Literal.create(str, StringType(collationId))
+  val substring = Literal.create(substr, StringType(collationId))
+
+  checkEvaluation(StringInstr(string, substring), expected)
+}
+
+var collationId = CollationFactory.collationNameToId("UTF8_BINARY")
+testInStr("aaads", "Aa", collationId, 0)
+testInStr("aaaDs", "de", collationId, 0)
+testInStr("aaads", "ds", collationId, 4)
+testInStr("", "", collationId, 1)
+testInStr("", "", collationId, 0)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "大千", collationId, 5)
+testInStr("test大千世界X大千世界", "界X", collationId, 8)
+// scalastyle:on
+
+collationId = CollationFactory.collationNameToId("UTF8_BINARY_LCASE")
+testInStr("aaads", "Aa", collationId, 1)
+testInStr("aaaDs", "de", collationId, 0)
+testInStr("aaaDs", "ds", collationId, 4)
+testInStr("", "", collationId, 1)
+testInStr("", "", collationId, 0)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "大千", collationId, 5)
+testInStr("test大千世界X大千世界", "界x", collationId, 8)
+// scalastyle:on
+
+collationId = CollationFactory.collationNameToId("UNICODE")
+testInStr("aaads", "Aa", collationId, 0)
+testInStr("aaads", "aa", collationId, 1)
+testInStr("aaads", "de", collationId, 0)
+testInStr("", "", collationId, 1)
+testInStr("", "", collationId, 0)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "界x", collationId, 0)
+testInStr("test大千世界X大千世界", "界X", collationId, 8)
+// scalastyle:on
+
+collationId = CollationFactory.collationNameToId("UNICODE_CI")
+testInStr("aaads", "AD", collationId, 3)
+testInStr("aaads", "dS", collationId, 4)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "界x", collationId, 8)
+// scalastyle:on
+  }
+
+  test("FIND_IN_SET check result on explicitly collated strings") {

Review Comment:
   Let's try to unify test naming
   
   e.g. `test("Support FindInSet string expression with collation")` in 
CollationStringExpressionsSuite.scala



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565494692


##
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##
@@ -96,6 +95,107 @@ class CollationStringExpressionsSuite
 assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT")
   }
 
+  test("INSTR check result on explicitly collated strings") {
+def testInStr(str: String, substr: String, collationId: Integer, expected: 
Integer): Unit = {
+  val string = Literal.create(str, StringType(collationId))
+  val substring = Literal.create(substr, StringType(collationId))
+
+  checkEvaluation(StringInstr(string, substring), expected)
+}
+
+var collationId = CollationFactory.collationNameToId("UTF8_BINARY")
+testInStr("aaads", "Aa", collationId, 0)
+testInStr("aaaDs", "de", collationId, 0)
+testInStr("aaads", "ds", collationId, 4)
+testInStr("", "", collationId, 1)
+testInStr("", "", collationId, 0)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "大千", collationId, 5)
+testInStr("test大千世界X大千世界", "界X", collationId, 8)
+// scalastyle:on
+
+collationId = CollationFactory.collationNameToId("UTF8_BINARY_LCASE")
+testInStr("aaads", "Aa", collationId, 1)
+testInStr("aaaDs", "de", collationId, 0)
+testInStr("aaaDs", "ds", collationId, 4)
+testInStr("", "", collationId, 1)
+testInStr("", "", collationId, 0)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "大千", collationId, 5)
+testInStr("test大千世界X大千世界", "界x", collationId, 8)
+// scalastyle:on
+
+collationId = CollationFactory.collationNameToId("UNICODE")
+testInStr("aaads", "Aa", collationId, 0)
+testInStr("aaads", "aa", collationId, 1)
+testInStr("aaads", "de", collationId, 0)
+testInStr("", "", collationId, 1)
+testInStr("", "", collationId, 0)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "界x", collationId, 0)
+testInStr("test大千世界X大千世界", "界X", collationId, 8)
+// scalastyle:on
+
+collationId = CollationFactory.collationNameToId("UNICODE_CI")
+testInStr("aaads", "AD", collationId, 3)
+testInStr("aaads", "dS", collationId, 4)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "界x", collationId, 8)
+// scalastyle:on
+  }
+
+  test("FIND_IN_SET check result on explicitly collated strings") {

Review Comment:
   Let's try to unify test naming
   
   e.g. `test("Support StringInstr string expression with collation")` in 
CollationStringExpressionsSuite.scala



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565492245


##
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##
@@ -96,6 +95,107 @@ class CollationStringExpressionsSuite
 assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT")
   }
 
+  test("INSTR check result on explicitly collated strings") {
+def testInStr(str: String, substr: String, collationId: Integer, expected: 
Integer): Unit = {
+  val string = Literal.create(str, StringType(collationId))
+  val substring = Literal.create(substr, StringType(collationId))
+
+  checkEvaluation(StringInstr(string, substring), expected)
+}
+
+var collationId = CollationFactory.collationNameToId("UTF8_BINARY")
+testInStr("aaads", "Aa", collationId, 0)
+testInStr("aaaDs", "de", collationId, 0)
+testInStr("aaads", "ds", collationId, 4)
+testInStr("", "", collationId, 1)
+testInStr("", "", collationId, 0)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "大千", collationId, 5)
+testInStr("test大千世界X大千世界", "界X", collationId, 8)
+// scalastyle:on
+
+collationId = CollationFactory.collationNameToId("UTF8_BINARY_LCASE")
+testInStr("aaads", "Aa", collationId, 1)
+testInStr("aaaDs", "de", collationId, 0)
+testInStr("aaaDs", "ds", collationId, 4)
+testInStr("", "", collationId, 1)
+testInStr("", "", collationId, 0)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "大千", collationId, 5)
+testInStr("test大千世界X大千世界", "界x", collationId, 8)
+// scalastyle:on
+
+collationId = CollationFactory.collationNameToId("UNICODE")
+testInStr("aaads", "Aa", collationId, 0)
+testInStr("aaads", "aa", collationId, 1)
+testInStr("aaads", "de", collationId, 0)
+testInStr("", "", collationId, 1)
+testInStr("", "", collationId, 0)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "界x", collationId, 0)
+testInStr("test大千世界X大千世界", "界X", collationId, 8)
+// scalastyle:on
+
+collationId = CollationFactory.collationNameToId("UNICODE_CI")
+testInStr("aaads", "AD", collationId, 3)
+testInStr("aaads", "dS", collationId, 4)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "界x", collationId, 8)
+// scalastyle:on

Review Comment:
   unit tests should be moved to CollationSupportSuite.java, following the 
appropriate format
   
   please adjust your tests
   
   also, add appropriate sql tests to CollationStringExpressionsSuite.scala



##
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##
@@ -96,6 +95,107 @@ class CollationStringExpressionsSuite
 assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT")
   }
 
+  test("INSTR check result on explicitly collated strings") {
+def testInStr(str: String, substr: String, collationId: Integer, expected: 
Integer): Unit = {
+  val string = Literal.create(str, StringType(collationId))
+  val substring = Literal.create(substr, StringType(collationId))
+
+  checkEvaluation(StringInstr(string, substring), expected)
+}
+
+var collationId = CollationFactory.collationNameToId("UTF8_BINARY")
+testInStr("aaads", "Aa", collationId, 0)
+testInStr("aaaDs", "de", collationId, 0)
+testInStr("aaads", "ds", collationId, 4)
+testInStr("", "", collationId, 1)
+testInStr("", "", collationId, 0)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "大千", collationId, 5)
+testInStr("test大千世界X大千世界", "界X", collationId, 8)
+// scalastyle:on
+
+collationId = CollationFactory.collationNameToId("UTF8_BINARY_LCASE")
+testInStr("aaads", "Aa", collationId, 1)
+testInStr("aaaDs", "de", collationId, 0)
+testInStr("aaaDs", "ds", collationId, 4)
+testInStr("", "", collationId, 1)
+testInStr("", "", collationId, 0)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "大千", collationId, 5)
+testInStr("test大千世界X大千世界", "界x", collationId, 8)
+// scalastyle:on
+
+collationId = CollationFactory.collationNameToId("UNICODE")
+testInStr("aaads", "Aa", collationId, 0)
+testInStr("aaads", "aa", collationId, 1)
+testInStr("aaads", "de", collationId, 0)
+testInStr("", "", collationId, 1)
+testInStr("", "", collationId, 0)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "界x", collationId, 0)
+testInStr("test大千世界X大千世界", "界X", collationId, 8)
+// scalastyle:on
+
+collationId = CollationFactory.collationNameToId("UNICODE_CI")
+testInStr("aaads", "AD", collationId, 3)
+testInStr("aaads", "dS", collationId, 4)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "界x", collationId, 8)
+// scalastyle:on
+  }
+
+  test("FIND_IN_SET check result on explicitly collated strings") {
+def testFindInSet(word: String, set: String, collationId: Integer, 
expected: Integer): Unit = {
+  val 

Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565490452


##
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##
@@ -96,6 +95,107 @@ class CollationStringExpressionsSuite
 assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT")
   }
 
+  test("INSTR check result on explicitly collated strings") {
+def testInStr(str: String, substr: String, collationId: Integer, expected: 
Integer): Unit = {
+  val string = Literal.create(str, StringType(collationId))
+  val substring = Literal.create(substr, StringType(collationId))
+
+  checkEvaluation(StringInstr(string, substring), expected)
+}
+
+var collationId = CollationFactory.collationNameToId("UTF8_BINARY")
+testInStr("aaads", "Aa", collationId, 0)
+testInStr("aaaDs", "de", collationId, 0)
+testInStr("aaads", "ds", collationId, 4)
+testInStr("", "", collationId, 1)
+testInStr("", "", collationId, 0)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "大千", collationId, 5)
+testInStr("test大千世界X大千世界", "界X", collationId, 8)
+// scalastyle:on
+
+collationId = CollationFactory.collationNameToId("UTF8_BINARY_LCASE")
+testInStr("aaads", "Aa", collationId, 1)
+testInStr("aaaDs", "de", collationId, 0)
+testInStr("aaaDs", "ds", collationId, 4)
+testInStr("", "", collationId, 1)
+testInStr("", "", collationId, 0)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "大千", collationId, 5)
+testInStr("test大千世界X大千世界", "界x", collationId, 8)
+// scalastyle:on
+
+collationId = CollationFactory.collationNameToId("UNICODE")
+testInStr("aaads", "Aa", collationId, 0)
+testInStr("aaads", "aa", collationId, 1)
+testInStr("aaads", "de", collationId, 0)
+testInStr("", "", collationId, 1)
+testInStr("", "", collationId, 0)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "界x", collationId, 0)
+testInStr("test大千世界X大千世界", "界X", collationId, 8)
+// scalastyle:on
+
+collationId = CollationFactory.collationNameToId("UNICODE_CI")
+testInStr("aaads", "AD", collationId, 3)
+testInStr("aaads", "dS", collationId, 4)
+// scalastyle:off
+testInStr("test大千世界X大千世界", "界x", collationId, 8)
+// scalastyle:on
+  }
+
+  test("FIND_IN_SET check result on explicitly collated strings") {
+def testFindInSet(word: String, set: String, collationId: Integer, 
expected: Integer): Unit = {
+  val w = Literal.create(word, StringType(collationId))
+  val s = Literal.create(set, StringType(collationId))
+
+  checkEvaluation(FindInSet(w, s), expected)
+}

Review Comment:
   we shouldn't use unit tests like this, please take a look at the other tests 
in their appropriate suites:
   
   - CollationSupportSuite.java for unit tests related to newly introduced 
collation-aware support in CollationSupport.java
   - CollationStringExpressionsSuite.scala for e2e SQL tests that should be 
used in limited quantity
   
   read more in the refactor Jira ticket description notes:
   https://issues.apache.org/jira/browse/SPARK-47410



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565489630


##
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##
@@ -96,6 +95,107 @@ class CollationStringExpressionsSuite
 assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT")
   }
 
+  test("INSTR check result on explicitly collated strings") {
+def testInStr(str: String, substr: String, collationId: Integer, expected: 
Integer): Unit = {
+  val string = Literal.create(str, StringType(collationId))
+  val substring = Literal.create(substr, StringType(collationId))
+
+  checkEvaluation(StringInstr(string, substring), expected)
+}

Review Comment:
   we shouldn't use unit tests like this, please take a look at the other tests 
in their appropriate suites:
   
   - CollationSupportSuite.java for unit tests related to newly introduced 
collation-aware support in CollationSupport.java
   - CollationStringExpressionsSuite.scala for e2e SQL tests that should be 
used in limited quantity
   
   read more in the refactor Jira ticket description notes:
   https://issues.apache.org/jira/browse/SPARK-47410



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565484403


##
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:
##
@@ -1346,20 +1350,24 @@ case class StringTrimRight(srcStr: Expression, trimStr: 
Option[Expression] = Non
 case class StringInstr(str: Expression, substr: Expression)
   extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant {
 
+  final lazy val collationId: Int = 
left.dataType.asInstanceOf[StringType].collationId
+
   override def left: Expression = str
   override def right: Expression = substr
   override def dataType: DataType = IntegerType
-  override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
+  override def inputTypes: Seq[AbstractDataType] =
+Seq(StringTypeAnyCollation, StringTypeAnyCollation)
 
   override def nullSafeEval(string: Any, sub: Any): Any = {
-string.asInstanceOf[UTF8String].indexOf(sub.asInstanceOf[UTF8String], 0) + 
1
+CollationSupport.IndexOf.
+  exec(string.asInstanceOf[UTF8String], sub.asInstanceOf[UTF8String], 0, 
collationId) + 1
   }
 
   override def prettyName: String = "instr"
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-defineCodeGen(ctx, ev, (l, r) =>
-  s"($l).indexOf($r, 0) + 1")
+  defineCodeGen(ctx, ev, (l, r) =>
+CollationSupport.IndexOf.genCode(l, r, 0, collationId) + " + 1")

Review Comment:
   perhaps we should use "string" & "sub" (instead of "l" & "r") to preserve 
the original naming



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-15 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1565482699


##
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:
##
@@ -974,15 +974,19 @@ case class StringTranslate(srcExpr: Expression, 
matchingExpr: Expression, replac
 case class FindInSet(left: Expression, right: Expression) extends 
BinaryExpression
 with ImplicitCastInputTypes with NullIntolerant {
 
-  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)
+  final lazy val collationId: Int = 
left.dataType.asInstanceOf[StringType].collationId
+
+  override def inputTypes: Seq[AbstractDataType] =
+Seq(StringTypeAnyCollation, StringTypeAnyCollation)
 
-  override protected def nullSafeEval(word: Any, set: Any): Any =
-set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String])
+  override protected def nullSafeEval(word: Any, set: Any): Any = {
+CollationSupport.FindInSet.
+  exec(word.asInstanceOf[UTF8String], set.asInstanceOf[UTF8String], 
collationId)
+  }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-nullSafeCodeGen(ctx, ev, (word, set) =>
-  s"${ev.value} = $set.findInSet($word);"
-)
+nullSafeCodeGen(ctx, ev, (l, r) =>
+  s"${ev.value} = " + CollationSupport.FindInSet.genCode(l, r, 
collationId) + ";")

Review Comment:
   Did you try just using `defineCodeGen`?
   
   `defineCodeGen(ctx, ev, (word, set) => 
CollationSupport.FindInSet.genCode(word, set, collationId))`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-11 Thread via GitHub


uros-db commented on PR #45643:
URL: https://github.com/apache/spark/pull/45643#issuecomment-2049812695

   heads up: we’ve done some major code restructuring in 
https://github.com/apache/spark/pull/45978, so please sync these changes before 
moving on
   
   @miland-db you’ll likely need to rewrite the code in this PR, so please 
follow the guidelines outlined in 
https://issues.apache.org/jira/browse/SPARK-47410


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-08 Thread via GitHub


dbatomic commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1555475151


##
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##
@@ -549,6 +549,51 @@ public int findInSet(UTF8String match) {
 return 0;
   }
 
+  public int findInSet(UTF8String match, int collationId) {
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  return this.findInSet(match);
+}
+if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) {

Review Comment:
   @cloud-fan - Let's put on hold string expression development until we 
provide proper design. @uros-db, @miland-db and I will follow up on this today.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-08 Thread via GitHub


cloud-fan commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1555443962


##
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##
@@ -549,6 +549,51 @@ public int findInSet(UTF8String match) {
 return 0;
   }
 
+  public int findInSet(UTF8String match, int collationId) {
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  return this.findInSet(match);
+}
+if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) {

Review Comment:
   "lowercase collator" SGTM



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-08 Thread via GitHub


miland-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1555394471


##
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##
@@ -549,6 +549,51 @@ public int findInSet(UTF8String match) {
 return 0;
   }
 
+  public int findInSet(UTF8String match, int collationId) {
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  return this.findInSet(match);
+}
+if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) {

Review Comment:
   We can solve this in the same way we did i t here: 
[STRING_LOCATE](https://github.com/apache/spark/pull/45791). In the next PRs we 
updated `CollationFactory` to be able to return "lowercase collator" so we can 
unify the way we deal with collated strings. If you prefer, I can update this 
PR with that change.



##
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##
@@ -549,6 +549,51 @@ public int findInSet(UTF8String match) {
 return 0;
   }
 
+  public int findInSet(UTF8String match, int collationId) {
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  return this.findInSet(match);
+}
+if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) {

Review Comment:
   We can solve this in the same way we did it here: 
[STRING_LOCATE](https://github.com/apache/spark/pull/45791). In the next PRs we 
updated `CollationFactory` to be able to return "lowercase collator" so we can 
unify the way we deal with collated strings. If you prefer, I can update this 
PR with that change.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-08 Thread via GitHub


dbatomic commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1555389826


##
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##
@@ -549,6 +549,51 @@ public int findInSet(UTF8String match) {
 return 0;
   }
 
+  public int findInSet(UTF8String match, int collationId) {
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  return this.findInSet(match);
+}
+if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) {

Review Comment:
   That being said, I also don't like current direction of pushing everything 
into `UTF8String`.  Let me see if we can come up with some cleaner approach.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-08 Thread via GitHub


dbatomic commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1555378573


##
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##
@@ -549,6 +549,51 @@ public int findInSet(UTF8String match) {
 return 0;
   }
 
+  public int findInSet(UTF8String match, int collationId) {
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  return this.findInSet(match);
+}
+if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) {

Review Comment:
   We don't have to expect more special collations besides `UTF8_BINARY_LCASE`. 
Everything else will be based on ICU.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-07 Thread via GitHub


cloud-fan commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1555173817


##
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##
@@ -549,6 +549,51 @@ public int findInSet(UTF8String match) {
 return 0;
   }
 
+  public int findInSet(UTF8String match, int collationId) {
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  return this.findInSet(match);
+}
+if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) {

Review Comment:
   @dbatomic do we have a general principle for this special collation? Always 
lower-case first and then reuse existing UTF8String functions? Will we have 
more collations like this in the future?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-05 Thread via GitHub


miland-db commented on PR #45643:
URL: https://github.com/apache/spark/pull/45643#issuecomment-2039386033

   @MaxGekk can you please review this?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-04 Thread via GitHub


miland-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1549579283


##
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##
@@ -549,6 +549,45 @@ public int findInSet(UTF8String match) {
 return 0;
   }
 
+  public int findInSet(UTF8String match, int collationId) {
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  return this.findInSet(match);
+}
+if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) {
+  return this.toLowerCase().findInSet(match.toLowerCase());
+}
+return collationAwareFindInSet(match, collationId);
+}
+
+  private int collationAwareFindInSet(UTF8String match, int collationId) {
+if (match.contains(COMMA_UTF8)) {
+  return 0;
+}
+
+StringSearch stringSearch = CollationFactory.getStringSearch(this, match, 
collationId);
+
+String setString = this.toString();
+int wordStart = 0;
+while ((wordStart = stringSearch.next()) != StringSearch.DONE) {
+  boolean isValidStart = wordStart == 0 || setString.charAt(wordStart - 1) 
== ',';
+  boolean isValidEnd = wordStart + stringSearch.getMatchLength() == 
setString.length()
+  || setString.charAt(wordStart + stringSearch.getMatchLength()) 
== ',';
+
+  if (isValidStart && isValidEnd) {
+int pos = 0;
+for (int i = 0; i < setString.length() && i < wordStart; i++) {
+  if (setString.charAt(i) == ',') {
+pos++;
+  }
+}
+
+return pos + 1;

Review Comment:
   I think it is specific for this case because we don't need methods like this 
in other places:
   `private int countBefore(UTF8String pattern, int collationId, int end)`
   `private int countAfter(UTF8String pattern, int collationId, int start)`
   
   Maybe if we encounter such a case, we can adapt code then.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-04 Thread via GitHub


miland-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1551227337


##
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:
##
@@ -994,15 +994,29 @@ case class StringTranslate(srcExpr: Expression, 
matchingExpr: Expression, replac
 case class FindInSet(left: Expression, right: Expression) extends 
BinaryExpression
 with ImplicitCastInputTypes with NullIntolerant {
 
-  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)
+  final lazy val collationId: Int = 
left.dataType.asInstanceOf[StringType].collationId
+
+  override def inputTypes: Seq[AbstractDataType] =
+Seq(StringTypeAnyCollation, StringTypeAnyCollation)
 
-  override protected def nullSafeEval(word: Any, set: Any): Any =
-set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String])
+  override protected def nullSafeEval(word: Any, set: Any): Any = {
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String])
+} else {
+  set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String], 
collationId)
+}
+  }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-nullSafeCodeGen(ctx, ev, (word, set) =>
-  s"${ev.value} = $set.findInSet($word);"
-)
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  nullSafeCodeGen(ctx, ev, (word, set) => s"${ev.value} = 
$set.findInSet($word);")
+} else {
+  nullSafeCodeGen(ctx, ev, (word, set) => s"${ev.value} = 
$set.findInSet($word, $collationId);")
+}
+  }
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+super.checkInputDataTypes()
   }

Review Comment:
   Ok, removed!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-04 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1551195271


##
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:
##
@@ -994,15 +994,29 @@ case class StringTranslate(srcExpr: Expression, 
matchingExpr: Expression, replac
 case class FindInSet(left: Expression, right: Expression) extends 
BinaryExpression
 with ImplicitCastInputTypes with NullIntolerant {
 
-  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)
+  final lazy val collationId: Int = 
left.dataType.asInstanceOf[StringType].collationId
+
+  override def inputTypes: Seq[AbstractDataType] =
+Seq(StringTypeAnyCollation, StringTypeAnyCollation)
 
-  override protected def nullSafeEval(word: Any, set: Any): Any =
-set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String])
+  override protected def nullSafeEval(word: Any, set: Any): Any = {
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String])
+} else {
+  set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String], 
collationId)
+}
+  }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-nullSafeCodeGen(ctx, ev, (word, set) =>
-  s"${ev.value} = $set.findInSet($word);"
-)
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  nullSafeCodeGen(ctx, ev, (word, set) => s"${ev.value} = 
$set.findInSet($word);")
+} else {
+  nullSafeCodeGen(ctx, ev, (word, set) => s"${ev.value} = 
$set.findInSet($word, $collationId);")
+}
+  }
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+super.checkInputDataTypes()
   }

Review Comment:
   I don't think this is needed anymore (& elsewhere too)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-04 Thread via GitHub


miland-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1551132518


##
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:
##
@@ -1002,15 +1002,34 @@ case class StringTranslate(srcExpr: Expression, 
matchingExpr: Expression, replac
 case class FindInSet(left: Expression, right: Expression) extends 
BinaryExpression
 with ImplicitCastInputTypes with NullIntolerant {
 
-  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)
+  final lazy val collationId: Int = 
left.dataType.asInstanceOf[StringType].collationId
 
-  override protected def nullSafeEval(word: Any, set: Any): Any =
-set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String])
+  override def inputTypes: Seq[AbstractDataType] =
+Seq(StringTypeAnyCollation, StringTypeAnyCollation)
+
+  override protected def nullSafeEval(word: Any, set: Any): Any = {
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String])
+} else {
+  set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String], 
collationId)
+}
+  }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-nullSafeCodeGen(ctx, ev, (word, set) =>
-  s"${ev.value} = $set.findInSet($word);"
-)
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  nullSafeCodeGen(ctx, ev, (word, set) => s"${ev.value} = 
$set.findInSet($word);")
+} else {
+  nullSafeCodeGen(ctx, ev, (word, set) => s"${ev.value} = 
$set.findInSet($word, $collationId);")
+}
+  }
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+val defaultCheck = super.checkInputDataTypes()
+if (defaultCheck.isFailure) {
+  return defaultCheck
+}
+
+CollationTypeConstraints.checkCollationCompatibility(collationId, 
children.map(_.dataType))

Review Comment:
   Sure, will do



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-03 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1550968894


##
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:
##
@@ -1002,15 +1002,34 @@ case class StringTranslate(srcExpr: Expression, 
matchingExpr: Expression, replac
 case class FindInSet(left: Expression, right: Expression) extends 
BinaryExpression
 with ImplicitCastInputTypes with NullIntolerant {
 
-  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)
+  final lazy val collationId: Int = 
left.dataType.asInstanceOf[StringType].collationId
 
-  override protected def nullSafeEval(word: Any, set: Any): Any =
-set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String])
+  override def inputTypes: Seq[AbstractDataType] =
+Seq(StringTypeAnyCollation, StringTypeAnyCollation)
+
+  override protected def nullSafeEval(word: Any, set: Any): Any = {
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String])
+} else {
+  set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String], 
collationId)
+}
+  }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-nullSafeCodeGen(ctx, ev, (word, set) =>
-  s"${ev.value} = $set.findInSet($word);"
-)
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  nullSafeCodeGen(ctx, ev, (word, set) => s"${ev.value} = 
$set.findInSet($word);")
+} else {
+  nullSafeCodeGen(ctx, ev, (word, set) => s"${ev.value} = 
$set.findInSet($word, $collationId);")
+}
+  }
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+val defaultCheck = super.checkInputDataTypes()
+if (defaultCheck.isFailure) {
+  return defaultCheck
+}
+
+CollationTypeConstraints.checkCollationCompatibility(collationId, 
children.map(_.dataType))

Review Comment:
   heads up: [implicit cast](https://github.com/apache/spark/pull/45383) is now 
live
   please sync your fork and rebase your changes accordingly



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-03 Thread via GitHub


miland-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1549579283


##
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##
@@ -549,6 +549,45 @@ public int findInSet(UTF8String match) {
 return 0;
   }
 
+  public int findInSet(UTF8String match, int collationId) {
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  return this.findInSet(match);
+}
+if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) {
+  return this.toLowerCase().findInSet(match.toLowerCase());
+}
+return collationAwareFindInSet(match, collationId);
+}
+
+  private int collationAwareFindInSet(UTF8String match, int collationId) {
+if (match.contains(COMMA_UTF8)) {
+  return 0;
+}
+
+StringSearch stringSearch = CollationFactory.getStringSearch(this, match, 
collationId);
+
+String setString = this.toString();
+int wordStart = 0;
+while ((wordStart = stringSearch.next()) != StringSearch.DONE) {
+  boolean isValidStart = wordStart == 0 || setString.charAt(wordStart - 1) 
== ',';
+  boolean isValidEnd = wordStart + stringSearch.getMatchLength() == 
setString.length()
+  || setString.charAt(wordStart + stringSearch.getMatchLength()) 
== ',';
+
+  if (isValidStart && isValidEnd) {
+int pos = 0;
+for (int i = 0; i < setString.length() && i < wordStart; i++) {
+  if (setString.charAt(i) == ',') {
+pos++;
+  }
+}
+
+return pos + 1;

Review Comment:
   I think it is specific for this case because we don't need methods like in 
other places:
   `private int countBefore(UTF8String pattern, int collationId, int end)`
   `private int countAfter(UTF8String pattern, int collationId, int start)`
   
   Maybe if we encounter such a case, we can adapt code then.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-03 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1549419980


##
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##
@@ -549,6 +549,45 @@ public int findInSet(UTF8String match) {
 return 0;
   }
 
+  public int findInSet(UTF8String match, int collationId) {

Review Comment:
   I think we now have a lot of new methods 
   consider adding comments such as: 
https://github.com/apache/spark/pull/45791/files#r1549415554



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-03 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1549393104


##
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##
@@ -73,17 +74,78 @@ class CollationStringExpressionsSuite extends QueryTest
 })
   }
 
+  test("INSTR check result on explicitly collated strings") {

Review Comment:
   try covering some more edge cases, such as: empty strings, uppercase and 
lowercase mix, different byte-length chars, etc.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-03 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1549393104


##
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##
@@ -73,17 +74,78 @@ class CollationStringExpressionsSuite extends QueryTest
 })
   }
 
+  test("INSTR check result on explicitly collated strings") {

Review Comment:
   (goes for all tests) try covering some more edge cases, such as: empty 
strings, uppercase and lowercase mix, different byte-length chars, etc.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-03 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1549387717


##
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##
@@ -73,17 +74,78 @@ class CollationStringExpressionsSuite extends QueryTest
 })
   }
 
+  test("INSTR check result on explicitly collated strings") {
+def testInStr(str: String, substr: String, collationId: Integer, expected: 
Integer): Unit = {
+  val string = Literal.create(str, StringType(collationId))
+  val substring = Literal.create(substr, StringType(collationId))
+
+  checkEvaluation(StringInstr(string, substring), expected)
+}
+
+var collationId = CollationFactory.collationNameToId("UTF8_BINARY_LCASE")

Review Comment:
   since you have some "UTF8_BINARY" tests down below, consider adding a couple 
here as well



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-03 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1549378473


##
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##
@@ -549,6 +549,45 @@ public int findInSet(UTF8String match) {
 return 0;
   }
 
+  public int findInSet(UTF8String match, int collationId) {
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  return this.findInSet(match);
+}
+if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) {
+  return this.toLowerCase().findInSet(match.toLowerCase());
+}
+return collationAwareFindInSet(match, collationId);
+}
+
+  private int collationAwareFindInSet(UTF8String match, int collationId) {
+if (match.contains(COMMA_UTF8)) {
+  return 0;
+}
+
+StringSearch stringSearch = CollationFactory.getStringSearch(this, match, 
collationId);
+
+String setString = this.toString();
+int wordStart = 0;
+while ((wordStart = stringSearch.next()) != StringSearch.DONE) {
+  boolean isValidStart = wordStart == 0 || setString.charAt(wordStart - 1) 
== ',';
+  boolean isValidEnd = wordStart + stringSearch.getMatchLength() == 
setString.length()
+  || setString.charAt(wordStart + stringSearch.getMatchLength()) 
== ',';
+
+  if (isValidStart && isValidEnd) {
+int pos = 0;
+for (int i = 0; i < setString.length() && i < wordStart; i++) {
+  if (setString.charAt(i) == ',') {
+pos++;
+  }
+}
+
+return pos + 1;

Review Comment:
   if this piece of code is just counting commas, consider using a separate 
java function for this
   if such function doesn't currently exist, perhaps you could modularize this 
code a bit and introduce it



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]

2024-04-03 Thread via GitHub


uros-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1549356302


##
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##
@@ -549,6 +549,45 @@ public int findInSet(UTF8String match) {
 return 0;
   }
 
+  public int findInSet(UTF8String match, int collationId) {
+if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+  return this.findInSet(match);
+}
+if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) {
+  return this.toLowerCase().findInSet(match.toLowerCase());
+}
+return collationAwareFindInSet(match, collationId);
+}

Review Comment:
   fix indentation



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org