Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1570050123 ## common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java: ## @@ -261,6 +261,84 @@ public void testEndsWith() throws SparkException { assertEndsWith("The i̇o", "İo", "UNICODE_CI", true); } + private void assertStringInstr(String string, String substring, String collationName, + Integer expected) throws SparkException { +UTF8String str = UTF8String.fromString(string); +UTF8String substr = UTF8String.fromString(substring); +int collationId = CollationFactory.collationNameToId(collationName); +assertEquals(expected, CollationSupport.StringInstr.exec(str, substr, collationId) + 1); + } + + @Test + public void testStringInstr() throws SparkException { +assertStringInstr("aaads", "Aa", "UTF8_BINARY", 0); +assertStringInstr("aaaDs", "de", "UTF8_BINARY", 0); +assertStringInstr("aaads", "ds", "UTF8_BINARY", 4); +assertStringInstr("", "", "UTF8_BINARY", 1); +assertStringInstr("", "", "UTF8_BINARY", 0); +assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY", 5); +assertStringInstr("test大千世界X大千世界", "界X", "UTF8_BINARY", 8); +assertStringInstr("aaads", "Aa", "UTF8_BINARY_LCASE", 1); +assertStringInstr("aaaDs", "de", "UTF8_BINARY_LCASE", 0); +assertStringInstr("aaaDs", "ds", "UTF8_BINARY_LCASE", 4); +assertStringInstr("", "", "UTF8_BINARY_LCASE", 1); +assertStringInstr("", "", "UTF8_BINARY_LCASE", 0); +assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY_LCASE", 5); +assertStringInstr("test大千世界X大千世界", "界x", "UTF8_BINARY_LCASE", 8); +assertStringInstr("aaads", "Aa", "UNICODE", 0); +assertStringInstr("aaads", "aa", "UNICODE", 1); +assertStringInstr("aaads", "de", "UNICODE", 0); +assertStringInstr("", "", "UNICODE", 1); +assertStringInstr("", "", "UNICODE", 0); +assertStringInstr("test大千世界X大千世界", "界x", "UNICODE", 0); +assertStringInstr("test大千世界X大千世界", "界X", "UNICODE", 8); +assertStringInstr("aaads", "AD", "UNICODE_CI", 3); +assertStringInstr("aaads", "dS", "UNICODE_CI", 4); +assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0); +assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8); + } Review Comment: ```suggestion assertStringInstr("abİo12", "i̇o", "UNICODE_CI", 3); assertStringInstr("abi̇o12", "İo", "UNICODE_CI", 3); } ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1570050746 ## common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java: ## @@ -261,6 +261,84 @@ public void testEndsWith() throws SparkException { assertEndsWith("The i̇o", "İo", "UNICODE_CI", true); } + private void assertStringInstr(String string, String substring, String collationName, + Integer expected) throws SparkException { +UTF8String str = UTF8String.fromString(string); +UTF8String substr = UTF8String.fromString(substring); +int collationId = CollationFactory.collationNameToId(collationName); +assertEquals(expected, CollationSupport.StringInstr.exec(str, substr, collationId) + 1); + } + + @Test + public void testStringInstr() throws SparkException { +assertStringInstr("aaads", "Aa", "UTF8_BINARY", 0); +assertStringInstr("aaaDs", "de", "UTF8_BINARY", 0); +assertStringInstr("aaads", "ds", "UTF8_BINARY", 4); +assertStringInstr("", "", "UTF8_BINARY", 1); +assertStringInstr("", "", "UTF8_BINARY", 0); +assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY", 5); +assertStringInstr("test大千世界X大千世界", "界X", "UTF8_BINARY", 8); +assertStringInstr("aaads", "Aa", "UTF8_BINARY_LCASE", 1); +assertStringInstr("aaaDs", "de", "UTF8_BINARY_LCASE", 0); +assertStringInstr("aaaDs", "ds", "UTF8_BINARY_LCASE", 4); +assertStringInstr("", "", "UTF8_BINARY_LCASE", 1); +assertStringInstr("", "", "UTF8_BINARY_LCASE", 0); +assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY_LCASE", 5); +assertStringInstr("test大千世界X大千世界", "界x", "UTF8_BINARY_LCASE", 8); +assertStringInstr("aaads", "Aa", "UNICODE", 0); +assertStringInstr("aaads", "aa", "UNICODE", 1); +assertStringInstr("aaads", "de", "UNICODE", 0); +assertStringInstr("", "", "UNICODE", 1); +assertStringInstr("", "", "UNICODE", 0); +assertStringInstr("test大千世界X大千世界", "界x", "UNICODE", 0); +assertStringInstr("test大千世界X大千世界", "界X", "UNICODE", 8); +assertStringInstr("aaads", "AD", "UNICODE_CI", 3); +assertStringInstr("aaads", "dS", "UNICODE_CI", 4); +assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0); +assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8); + } + + private void assertFindInSet(String word, String set, String collationName, +Integer expected) throws SparkException { +UTF8String w = UTF8String.fromString(word); +UTF8String s = UTF8String.fromString(set); +int collationId = CollationFactory.collationNameToId(collationName); +assertEquals(expected, CollationSupport.FindInSet.exec(w, s, collationId)); + } + + @Test + public void testFindInSet() throws SparkException { +assertFindInSet("AB", "abc,b,ab,c,def", "UTF8_BINARY", 0); +assertFindInSet("abc", "abc,b,ab,c,def", "UTF8_BINARY", 1); +assertFindInSet("def", "abc,b,ab,c,def", "UTF8_BINARY", 5); +assertFindInSet("d,ef", "abc,b,ab,c,def", "UTF8_BINARY", 0); +assertFindInSet("", "abc,b,ab,c,def", "UTF8_BINARY", 0); +assertFindInSet("a", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 0); +assertFindInSet("c", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 4); +assertFindInSet("AB", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 3); +assertFindInSet("AbC", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 1); +assertFindInSet("abcd", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 0); +assertFindInSet("d,ef", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 0); +assertFindInSet("XX", "xx", "UTF8_BINARY_LCASE", 1); +assertFindInSet("", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 0); +assertFindInSet("界x", "test,大千,世,界X,大,千,世界", "UTF8_BINARY_LCASE", 4); +assertFindInSet("a", "abc,b,ab,c,def", "UNICODE", 0); +assertFindInSet("ab", "abc,b,ab,c,def", "UNICODE", 3); +assertFindInSet("Ab", "abc,b,ab,c,def", "UNICODE", 0); +assertFindInSet("d,ef", "abc,b,ab,c,def", "UNICODE", 0); +assertFindInSet("xx", "xx", "UNICODE", 1); +assertFindInSet("界x", "test,大千,世,界X,大,千,世界", "UNICODE", 0); +assertFindInSet("大", "test,大千,世,界X,大,千,世界", "UNICODE", 5); +assertFindInSet("a", "abc,b,ab,c,def", "UNICODE_CI", 0); +assertFindInSet("C", "abc,b,ab,c,def", "UNICODE_CI", 4); +assertFindInSet("DeF", "abc,b,ab,c,dEf", "UNICODE_CI", 5); +assertFindInSet("DEFG", "abc,b,ab,c,def", "UNICODE_CI", 0); +assertFindInSet("XX", "xx", "UNICODE_CI", 1); +assertFindInSet("界x", "test,大千,世,界X,大,千,世界", "UNICODE_CI", 4); +assertFindInSet("界x", "test,大千,界Xx,世,界X,大,千,世界", "UNICODE_CI", 5); +assertFindInSet("大", "test,大千,世,界X,大,千,世界", "UNICODE_CI", 5); + } Review Comment: (same new test cases as above) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service,
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1570041252 ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala: ## @@ -18,9 +18,7 @@ package org.apache.spark.sql.catalyst.analysis import javax.annotation.Nullable - import scala.annotation.tailrec - Review Comment: undo these changes -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
miland-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565868020 ## sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala: ## @@ -17,13 +17,11 @@ package org.apache.spark.sql -import scala.collection.immutable.Seq - Review Comment: Reverted the change -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565859516 ## sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala: ## @@ -17,13 +17,11 @@ package org.apache.spark.sql -import scala.collection.immutable.Seq - Review Comment: are you sure that's the reason? If so, go ahead and remove it but I do wonder why this only comes up now, it doesn't have to do with your changes -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
miland-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565854152 ## sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala: ## @@ -17,13 +17,11 @@ package org.apache.spark.sql -import scala.collection.immutable.Seq - Review Comment: It will not pass the tests because of unused import -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565850753 ## sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala: ## @@ -17,13 +17,11 @@ package org.apache.spark.sql -import scala.collection.immutable.Seq - Review Comment: let's avoid unnecessary changes -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
miland-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565815937 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -138,72 +138,72 @@ public static boolean execICU(final UTF8String l, final UTF8String r, } public static class FindInSet { -public static int exec(final UTF8String l, final UTF8String r, final int collationId) { +public static int exec(final UTF8String word, final UTF8String set, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); if (collation.supportsBinaryEquality) { -return execBinary(l, r); +return execBinary(word, set); } else if (collation.supportsLowercaseEquality) { -return execLowercase(l, r); +return execLowercase(word, set); } else { -return execICU(l, r, collationId); +return execICU(word, set, collationId); } } -public static String genCode(final String l, final String r, final int collationId) { +public static String genCode(final String word, final String set, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); String expr = "CollationSupport.FindInSet.exec"; if (collation.supportsBinaryEquality) { -return String.format(expr + "Binary(%s, %s)", l, r); +return String.format(expr + "Binary(%s, %s)", word, set); } else if (collation.supportsLowercaseEquality) { -return String.format(expr + "Lowercase(%s, %s)", l, r); +return String.format(expr + "Lowercase(%s, %s)", word, set); } else { -return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId); +return String.format(expr + "ICU(%s, %s, %d)", word, set, collationId); } } -public static int execBinary(final UTF8String l, final UTF8String r) { - return r.findInSet(l); +public static int execBinary(final UTF8String word, final UTF8String set) { + return set.findInSet(word); } -public static int execLowercase(final UTF8String l, final UTF8String r) { - return r.toLowerCase().findInSet(l.toLowerCase()); +public static int execLowercase(final UTF8String word, final UTF8String set) { + return set.toLowerCase().findInSet(word.toLowerCase()); } -public static int execICU(final UTF8String l, final UTF8String r, +public static int execICU(final UTF8String word, final UTF8String set, final int collationId) { - return CollationAwareUTF8String.findInSet(l, r, collationId); + return CollationAwareUTF8String.findInSet(word, set, collationId); } } - public static class IndexOf { -public static int exec(final UTF8String l, final UTF8String r, final int start, + public static class StringInstr { +public static int exec(final UTF8String string, final UTF8String substring, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); if (collation.supportsBinaryEquality) { -return execBinary(l, r, start); +return execBinary(string, substring); } else if (collation.supportsLowercaseEquality) { -return execLowercase(l, r, start); +return execLowercase(string, substring); } else { -return execICU(l, r, start, collationId); +return execICU(string, substring, collationId); } } -public static String genCode(final String l, final String r, final int start, +public static String genCode(final String string, final String substring, final int start, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); - String expr = "CollationSupport.IndexOf.exec"; + String expr = "CollationSupport.StringInstr.exec"; if (collation.supportsBinaryEquality) { -return String.format(expr + "Binary(%s, %s, %d)", l, r, start); +return String.format(expr + "Binary(%s, %s, %d)", string, substring, start); } else if (collation.supportsLowercaseEquality) { -return String.format(expr + "Lowercase(%s, %s, %d)", l, r, start); +return String.format(expr + "Lowercase(%s, %s, %d)", string, substring, start); } else { -return String.format(expr + "ICU(%s, %s, %d, %d)", l, r, start, collationId); +return String.format(expr + "ICU(%s, %s, %d, %d)", string, substring, start, collationId); } } -public static int execBinary(final UTF8String l, final UTF8String r, final int start) { - return l.indexOf(r, start); +public static int execBinary(final UTF8String string, final UTF8String substring) { + return string.indexOf(substring, 0); } -public static int execLowercase(final UTF8String l,
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565772062 ## common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java: ## @@ -249,6 +249,86 @@ public void testEndsWith() throws SparkException { assertEndsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); } + private void assertStringInstr(String string, String substring, String collationName, + Integer value) throws SparkException { +UTF8String str = UTF8String.fromString(string); +UTF8String substr = UTF8String.fromString(substring); +int collationId = CollationFactory.collationNameToId(collationName); + +assertEquals(CollationSupport.StringInstr.exec(str, substr, collationId) + 1, value); + } + + @Test + public void testStringInstr() throws SparkException { +assertStringInstr("aaads", "Aa", "UTF8_BINARY", 0); +assertStringInstr("aaaDs", "de", "UTF8_BINARY", 0); +assertStringInstr("aaads", "ds", "UTF8_BINARY", 4); +assertStringInstr("", "", "UTF8_BINARY", 1); +assertStringInstr("", "", "UTF8_BINARY", 0); +assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY", 5); +assertStringInstr("test大千世界X大千世界", "界X", "UTF8_BINARY", 8); +assertStringInstr("aaads", "Aa", "UTF8_BINARY_LCASE", 1); +assertStringInstr("aaaDs", "de", "UTF8_BINARY_LCASE", 0); +assertStringInstr("aaaDs", "ds", "UTF8_BINARY_LCASE", 4); +assertStringInstr("", "", "UTF8_BINARY_LCASE", 1); +assertStringInstr("", "", "UTF8_BINARY_LCASE", 0); +assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY_LCASE", 5); +assertStringInstr("test大千世界X大千世界", "界x", "UTF8_BINARY_LCASE", 8); +assertStringInstr("aaads", "Aa", "UNICODE", 0); +assertStringInstr("aaads", "aa", "UNICODE", 1); +assertStringInstr("aaads", "de", "UNICODE", 0); +assertStringInstr("", "", "UNICODE", 1); +assertStringInstr("", "", "UNICODE", 0); +assertStringInstr("test大千世界X大千世界", "界x", "UNICODE", 0); +assertStringInstr("test大千世界X大千世界", "界X", "UNICODE", 8); +assertStringInstr("aaads", "AD", "UNICODE_CI", 3); +assertStringInstr("aaads", "dS", "UNICODE_CI", 4); +assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8); + } + + //word: String, set: String, collationId: Integer, expected: Integer + private void assertFindInSet(String word, String set, String collationName, + Integer value) throws SparkException { +UTF8String w = UTF8String.fromString(word); +UTF8String s = UTF8String.fromString(set); +int collationId = CollationFactory.collationNameToId(collationName); + +assertEquals(CollationSupport.FindInSet.exec(w, s, collationId), value); Review Comment: ```suggestion assertEquals(expected, CollationSupport.FindInSet.exec(w, s, collationId)); ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565771738 ## common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java: ## @@ -249,6 +249,86 @@ public void testEndsWith() throws SparkException { assertEndsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); } + private void assertStringInstr(String string, String substring, String collationName, + Integer value) throws SparkException { +UTF8String str = UTF8String.fromString(string); +UTF8String substr = UTF8String.fromString(substring); +int collationId = CollationFactory.collationNameToId(collationName); + +assertEquals(CollationSupport.StringInstr.exec(str, substr, collationId) + 1, value); Review Comment: ```suggestion assertEquals(expected, CollationSupport.StringInstr.exec(str, substr, collationId) + 1); ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565770706 ## common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java: ## @@ -249,6 +249,86 @@ public void testEndsWith() throws SparkException { assertEndsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); } + private void assertStringInstr(String string, String substring, String collationName, Review Comment: this is important: Vladimir noticed that we've been using assertEquals incorrectly in `CollationSupportSuite.java` please fix this according to his PR changes: https://github.com/apache/spark/pull/46058 since all of the asserts pass here, we wouldn't notice this, but the correct order is: first **expected** value, and then **received** value -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565766846 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -138,72 +138,72 @@ public static boolean execICU(final UTF8String l, final UTF8String r, } public static class FindInSet { -public static int exec(final UTF8String l, final UTF8String r, final int collationId) { +public static int exec(final UTF8String word, final UTF8String set, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); if (collation.supportsBinaryEquality) { -return execBinary(l, r); +return execBinary(word, set); } else if (collation.supportsLowercaseEquality) { -return execLowercase(l, r); +return execLowercase(word, set); } else { -return execICU(l, r, collationId); +return execICU(word, set, collationId); } } -public static String genCode(final String l, final String r, final int collationId) { +public static String genCode(final String word, final String set, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); String expr = "CollationSupport.FindInSet.exec"; if (collation.supportsBinaryEquality) { -return String.format(expr + "Binary(%s, %s)", l, r); +return String.format(expr + "Binary(%s, %s)", word, set); } else if (collation.supportsLowercaseEquality) { -return String.format(expr + "Lowercase(%s, %s)", l, r); +return String.format(expr + "Lowercase(%s, %s)", word, set); } else { -return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId); +return String.format(expr + "ICU(%s, %s, %d)", word, set, collationId); } } -public static int execBinary(final UTF8String l, final UTF8String r) { - return r.findInSet(l); +public static int execBinary(final UTF8String word, final UTF8String set) { + return set.findInSet(word); } -public static int execLowercase(final UTF8String l, final UTF8String r) { - return r.toLowerCase().findInSet(l.toLowerCase()); +public static int execLowercase(final UTF8String word, final UTF8String set) { + return set.toLowerCase().findInSet(word.toLowerCase()); } -public static int execICU(final UTF8String l, final UTF8String r, +public static int execICU(final UTF8String word, final UTF8String set, final int collationId) { - return CollationAwareUTF8String.findInSet(l, r, collationId); + return CollationAwareUTF8String.findInSet(word, set, collationId); } } - public static class IndexOf { -public static int exec(final UTF8String l, final UTF8String r, final int start, + public static class StringInstr { +public static int exec(final UTF8String string, final UTF8String substring, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); if (collation.supportsBinaryEquality) { -return execBinary(l, r, start); +return execBinary(string, substring); } else if (collation.supportsLowercaseEquality) { -return execLowercase(l, r, start); +return execLowercase(string, substring); } else { -return execICU(l, r, start, collationId); +return execICU(string, substring, collationId); } } -public static String genCode(final String l, final String r, final int start, +public static String genCode(final String string, final String substring, final int start, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); - String expr = "CollationSupport.IndexOf.exec"; + String expr = "CollationSupport.StringInstr.exec"; if (collation.supportsBinaryEquality) { -return String.format(expr + "Binary(%s, %s, %d)", l, r, start); +return String.format(expr + "Binary(%s, %s, %d)", string, substring, start); } else if (collation.supportsLowercaseEquality) { -return String.format(expr + "Lowercase(%s, %s, %d)", l, r, start); +return String.format(expr + "Lowercase(%s, %s, %d)", string, substring, start); } else { -return String.format(expr + "ICU(%s, %s, %d, %d)", l, r, start, collationId); +return String.format(expr + "ICU(%s, %s, %d, %d)", string, substring, start, collationId); } } -public static int execBinary(final UTF8String l, final UTF8String r, final int start) { - return l.indexOf(r, start); +public static int execBinary(final UTF8String string, final UTF8String substring) { + return string.indexOf(substring, 0); } -public static int execLowercase(final UTF8String l, final
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
miland-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565559690 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, final UTF8String pattern pos, pos + pattern.numChars()), pattern, collationId).last() == 0; } +private static int findInSet(UTF8String match, UTF8String set, int collationId) { + if (match.contains(UTF8String.fromString(","))) { +return 0; + } + + StringSearch stringSearch = CollationFactory.getStringSearch(set, match, collationId); + + String setString = set.toString(); Review Comment: Added -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565559007 ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala: ## @@ -974,15 +974,19 @@ case class StringTranslate(srcExpr: Expression, matchingExpr: Expression, replac case class FindInSet(left: Expression, right: Expression) extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId + + override def inputTypes: Seq[AbstractDataType] = +Seq(StringTypeAnyCollation, StringTypeAnyCollation) - override protected def nullSafeEval(word: Any, set: Any): Any = -set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String]) + override protected def nullSafeEval(word: Any, set: Any): Any = { +CollationSupport.FindInSet. + exec(word.asInstanceOf[UTF8String], set.asInstanceOf[UTF8String], collationId) + } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -nullSafeCodeGen(ctx, ev, (word, set) => - s"${ev.value} = $set.findInSet($word);" -) +nullSafeCodeGen(ctx, ev, (l, r) => + s"${ev.value} = " + CollationSupport.FindInSet.genCode(l, r, collationId) + ";") Review Comment: of course, it's just shorthand (see defineCodeGen impl) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
miland-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r156472 ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala: ## @@ -974,15 +974,19 @@ case class StringTranslate(srcExpr: Expression, matchingExpr: Expression, replac case class FindInSet(left: Expression, right: Expression) extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId + + override def inputTypes: Seq[AbstractDataType] = +Seq(StringTypeAnyCollation, StringTypeAnyCollation) - override protected def nullSafeEval(word: Any, set: Any): Any = -set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String]) + override protected def nullSafeEval(word: Any, set: Any): Any = { +CollationSupport.FindInSet. + exec(word.asInstanceOf[UTF8String], set.asInstanceOf[UTF8String], collationId) + } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -nullSafeCodeGen(ctx, ev, (word, set) => - s"${ev.value} = $set.findInSet($word);" -) +nullSafeCodeGen(ctx, ev, (l, r) => + s"${ev.value} = " + CollationSupport.FindInSet.genCode(l, r, collationId) + ";") Review Comment: It works, but I am not sure if we should go from `nullSafeCodeGen` to just `defineCodeGen`? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565531354 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, final UTF8String pattern pos, pos + pattern.numChars()), pattern, collationId).last() == 0; } +private static int findInSet(UTF8String match, UTF8String set, int collationId) { + if (match.contains(UTF8String.fromString(","))) { +return 0; + } + + StringSearch stringSearch = CollationFactory.getStringSearch(set, match, collationId); + + String setString = set.toString(); Review Comment: and if you do that, I'd say that the UTF8String implementation for `getStringSearch` can amount to: ``` public static StringSearch getStringSearch( final UTF8String left, final UTF8String right, final int collationId) { return getStringSearch(left.toString(), right.toString(), collationId); } ``` this way, we can have a more complete API -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565524362 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, final UTF8String pattern pos, pos + pattern.numChars()), pattern, collationId).last() == 0; } +private static int findInSet(UTF8String match, UTF8String set, int collationId) { + if (match.contains(UTF8String.fromString(","))) { +return 0; + } + + StringSearch stringSearch = CollationFactory.getStringSearch(set, match, collationId); + + String setString = set.toString(); Review Comment: I think `CollationFactory.getStringSearch(set, match, collationId)` already does `set.toString()` there's no need to do that twice, let's add: ``` public static StringSearch getStringSearch( final String targetString, final String pattern, final int collationId) { CharacterIterator target = new StringCharacterIterator(targetString); Collator collator = CollationFactory.fetchCollation(collationId).collator; return new StringSearch(pattern, target, (RuleBasedCollator) collator); } ``` to `CollationFactory` this way, we can get the stringSearch and setString instances here, without extra String allocations -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565508312 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -137,6 +137,76 @@ public static boolean execICU(final UTF8String l, final UTF8String r, } } + public static class FindInSet { +public static int exec(final UTF8String l, final UTF8String r, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { +return execBinary(l, r); + } else if (collation.supportsLowercaseEquality) { +return execLowercase(l, r); + } else { +return execICU(l, r, collationId); + } +} +public static String genCode(final String l, final String r, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.FindInSet.exec"; + if (collation.supportsBinaryEquality) { +return String.format(expr + "Binary(%s, %s)", l, r); + } else if (collation.supportsLowercaseEquality) { +return String.format(expr + "Lowercase(%s, %s)", l, r); + } else { +return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId); + } +} +public static int execBinary(final UTF8String l, final UTF8String r) { + return r.findInSet(l); +} +public static int execLowercase(final UTF8String l, final UTF8String r) { + return r.toLowerCase().findInSet(l.toLowerCase()); +} +public static int execICU(final UTF8String l, final UTF8String r, + final int collationId) { + return CollationAwareUTF8String.findInSet(l, r, collationId); +} + } + + public static class IndexOf { Review Comment: `CollationAwareUTF8String.indexOf` should follow the `UTF8String.indexOf` naming, so that part is fine for example but `CollationSupport.StringInstr` should definitely follow `stringExpressions.StringInstr` naming -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565521994 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, final UTF8String pattern pos, pos + pattern.numChars()), pattern, collationId).last() == 0; } +private static int findInSet(UTF8String match, UTF8String set, int collationId) { + if (match.contains(UTF8String.fromString(","))) { +return 0; + } + + StringSearch stringSearch = CollationFactory.getStringSearch(set, match, collationId); + + String setString = set.toString(); Review Comment: I think `CollationFactory.getStringSearch(set, match, collationId)` already does set.toString() there's no need to do that twice, let's add: ``` public static StringSearch getStringSearch( final String targetString, final String pattern, final int collationId) { CharacterIterator target = new StringCharacterIterator(targetString); Collator collator = CollationFactory.fetchCollation(collationId).collator; return new StringSearch(pattern, target, (RuleBasedCollator) collator); } ``` to `CollationFactory` this way, we can get the `stringSearch` and `setString` instances here, without extra String allocations ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, final UTF8String pattern pos, pos + pattern.numChars()), pattern, collationId).last() == 0; } +private static int findInSet(UTF8String match, UTF8String set, int collationId) { + if (match.contains(UTF8String.fromString(","))) { +return 0; + } + + StringSearch stringSearch = CollationFactory.getStringSearch(set, match, collationId); + + String setString = set.toString(); Review Comment: I think CollationFactory.getStringSearch(set, match, collationId) already does set.toString() there's no need to do that twice, let's add: public static StringSearch getStringSearch( final String targetString, final String pattern, final int collationId) { CharacterIterator target = new StringCharacterIterator(targetString); Collator collator = CollationFactory.fetchCollation(collationId).collator; return new StringSearch(pattern, target, (RuleBasedCollator) collator); } to CollationFactory this way, we can get the stringSearch and setString instances here, without extra String allocations -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565521994 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, final UTF8String pattern pos, pos + pattern.numChars()), pattern, collationId).last() == 0; } +private static int findInSet(UTF8String match, UTF8String set, int collationId) { + if (match.contains(UTF8String.fromString(","))) { +return 0; + } + + StringSearch stringSearch = CollationFactory.getStringSearch(set, match, collationId); + + String setString = set.toString(); Review Comment: I think `CollationFactory.getStringSearch(set, match, collationId)` already does set.toString() there's no need to do that twice, let's add: ``` public static StringSearch getStringSearch( final String targetString, final String pattern, final int collationId) { CharacterIterator target = new StringCharacterIterator(targetString); Collator collator = CollationFactory.fetchCollation(collationId).collator; return new StringSearch(pattern, target, (RuleBasedCollator) collator); } ``` to `CollationFactory` this way, we can get the `stringSearch` and `setString` instances here, without extra String allocations -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565521994 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, final UTF8String pattern pos, pos + pattern.numChars()), pattern, collationId).last() == 0; } +private static int findInSet(UTF8String match, UTF8String set, int collationId) { + if (match.contains(UTF8String.fromString(","))) { +return 0; + } + + StringSearch stringSearch = CollationFactory.getStringSearch(set, match, collationId); + + String setString = set.toString(); Review Comment: I think `CollationFactory.getStringSearch(set, match, collationId)` already does set.toString() there's no need to do that twice, let's add: ` public static StringSearch getStringSearch( final String target, final String patternString, final int collationId) { CharacterIterator target = new StringCharacterIterator(left.toString()); Collator collator = CollationFactory.fetchCollation(collationId).collator; return new StringSearch(pattern, target, (RuleBasedCollator) collator); } ` to `CollationFactory` this way, we can get the `stringSearch` and `setString` instances here, without extra String allocations ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, final UTF8String pattern pos, pos + pattern.numChars()), pattern, collationId).last() == 0; } +private static int findInSet(UTF8String match, UTF8String set, int collationId) { + if (match.contains(UTF8String.fromString(","))) { +return 0; + } + + StringSearch stringSearch = CollationFactory.getStringSearch(set, match, collationId); + + String setString = set.toString(); Review Comment: I think `CollationFactory.getStringSearch(set, match, collationId)` already does set.toString() there's no need to do that twice, let's add: ``` public static StringSearch getStringSearch( final String target, final String patternString, final int collationId) { CharacterIterator target = new StringCharacterIterator(left.toString()); Collator collator = CollationFactory.fetchCollation(collationId).collator; return new StringSearch(pattern, target, (RuleBasedCollator) collator); } ``` to `CollationFactory` this way, we can get the `stringSearch` and `setString` instances here, without extra String allocations -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565521994 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, final UTF8String pattern pos, pos + pattern.numChars()), pattern, collationId).last() == 0; } +private static int findInSet(UTF8String match, UTF8String set, int collationId) { + if (match.contains(UTF8String.fromString(","))) { +return 0; + } + + StringSearch stringSearch = CollationFactory.getStringSearch(set, match, collationId); + + String setString = set.toString(); Review Comment: I think `CollationFactory.getStringSearch(set, match, collationId)` already does set.toString() there's no need to do that twice, let's add: ` public static StringSearch getStringSearch( final String target, final String patternString, final int collationId) { CharacterIterator target = new StringCharacterIterator(left.toString()); Collator collator = CollationFactory.fetchCollation(collationId).collator; return new StringSearch(pattern, target, (RuleBasedCollator) collator); }` to `CollationFactory` this way, we can get the `stringSearch` and `setString` instances here, without extra String allocations ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, final UTF8String pattern pos, pos + pattern.numChars()), pattern, collationId).last() == 0; } +private static int findInSet(UTF8String match, UTF8String set, int collationId) { + if (match.contains(UTF8String.fromString(","))) { +return 0; + } + + StringSearch stringSearch = CollationFactory.getStringSearch(set, match, collationId); + + String setString = set.toString(); Review Comment: I think `CollationFactory.getStringSearch(set, match, collationId)` already does set.toString() there's no need to do that twice, let's add: ` public static StringSearch getStringSearch( final String target, final String patternString, final int collationId) { CharacterIterator target = new StringCharacterIterator(left.toString()); Collator collator = CollationFactory.fetchCollation(collationId).collator; return new StringSearch(pattern, target, (RuleBasedCollator) collator); }` to `CollationFactory` this way, we can get the `stringSearch` and `setString` instances here, without extra String allocations -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565515634 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, final UTF8String pattern pos, pos + pattern.numChars()), pattern, collationId).last() == 0; } +private static int findInSet(UTF8String match, UTF8String set, int collationId) { Review Comment: these parameters should be `final` ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -169,6 +239,46 @@ private static boolean matchAt(final UTF8String target, final UTF8String pattern pos, pos + pattern.numChars()), pattern, collationId).last() == 0; } +private static int findInSet(UTF8String match, UTF8String set, int collationId) { + if (match.contains(UTF8String.fromString(","))) { +return 0; + } + + StringSearch stringSearch = CollationFactory.getStringSearch(set, match, collationId); + + String setString = set.toString(); + int wordStart = 0; + while ((wordStart = stringSearch.next()) != StringSearch.DONE) { +boolean isValidStart = wordStart == 0 || setString.charAt(wordStart - 1) == ','; +boolean isValidEnd = wordStart + stringSearch.getMatchLength() == setString.length() +|| setString.charAt(wordStart + stringSearch.getMatchLength()) == ','; + +if (isValidStart && isValidEnd) { + int pos = 0; + for (int i = 0; i < setString.length() && i < wordStart; i++) { +if (setString.charAt(i) == ',') { + pos++; +} + } + + return pos + 1; +} + } + + return 0; +} + +private static int indexOf(UTF8String target, UTF8String pattern, int start, int collationId) { Review Comment: these parameters should be `final` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565510703 ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala: ## @@ -974,15 +974,19 @@ case class StringTranslate(srcExpr: Expression, matchingExpr: Expression, replac case class FindInSet(left: Expression, right: Expression) extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId + + override def inputTypes: Seq[AbstractDataType] = +Seq(StringTypeAnyCollation, StringTypeAnyCollation) - override protected def nullSafeEval(word: Any, set: Any): Any = -set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String]) + override protected def nullSafeEval(word: Any, set: Any): Any = { +CollationSupport.FindInSet. + exec(word.asInstanceOf[UTF8String], set.asInstanceOf[UTF8String], collationId) + } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -nullSafeCodeGen(ctx, ev, (word, set) => - s"${ev.value} = $set.findInSet($word);" -) +nullSafeCodeGen(ctx, ev, (l, r) => Review Comment: perhaps we should use "word" & "set" (instead of "l" & "r") to preserve the original naming -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565508312 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -137,6 +137,76 @@ public static boolean execICU(final UTF8String l, final UTF8String r, } } + public static class FindInSet { +public static int exec(final UTF8String l, final UTF8String r, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { +return execBinary(l, r); + } else if (collation.supportsLowercaseEquality) { +return execLowercase(l, r); + } else { +return execICU(l, r, collationId); + } +} +public static String genCode(final String l, final String r, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.FindInSet.exec"; + if (collation.supportsBinaryEquality) { +return String.format(expr + "Binary(%s, %s)", l, r); + } else if (collation.supportsLowercaseEquality) { +return String.format(expr + "Lowercase(%s, %s)", l, r); + } else { +return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId); + } +} +public static int execBinary(final UTF8String l, final UTF8String r) { + return r.findInSet(l); +} +public static int execLowercase(final UTF8String l, final UTF8String r) { + return r.toLowerCase().findInSet(l.toLowerCase()); +} +public static int execICU(final UTF8String l, final UTF8String r, + final int collationId) { + return CollationAwareUTF8String.findInSet(l, r, collationId); +} + } + + public static class IndexOf { Review Comment: `CollationAwareUTF8String.indexOf` should follow the `UTF8String.indexOf naming`, so that part is fine for example but `CollationSupport.StringInstr` should definitely follow `stringExpressions.StringInstr` naming -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565502312 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -137,6 +137,76 @@ public static boolean execICU(final UTF8String l, final UTF8String r, } } + public static class FindInSet { +public static int exec(final UTF8String l, final UTF8String r, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { +return execBinary(l, r); + } else if (collation.supportsLowercaseEquality) { +return execLowercase(l, r); + } else { +return execICU(l, r, collationId); + } +} +public static String genCode(final String l, final String r, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.FindInSet.exec"; + if (collation.supportsBinaryEquality) { +return String.format(expr + "Binary(%s, %s)", l, r); + } else if (collation.supportsLowercaseEquality) { +return String.format(expr + "Lowercase(%s, %s)", l, r); + } else { +return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId); + } +} +public static int execBinary(final UTF8String l, final UTF8String r) { + return r.findInSet(l); +} +public static int execLowercase(final UTF8String l, final UTF8String r) { + return r.toLowerCase().findInSet(l.toLowerCase()); +} +public static int execICU(final UTF8String l, final UTF8String r, + final int collationId) { + return CollationAwareUTF8String.findInSet(l, r, collationId); +} + } + + public static class IndexOf { Review Comment: let's keep the naming uniform if the original expression is `case class StringInstr`, then this should probably be: `public static class IndexOf` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565502312 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -137,6 +137,76 @@ public static boolean execICU(final UTF8String l, final UTF8String r, } } + public static class FindInSet { +public static int exec(final UTF8String l, final UTF8String r, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { +return execBinary(l, r); + } else if (collation.supportsLowercaseEquality) { +return execLowercase(l, r); + } else { +return execICU(l, r, collationId); + } +} +public static String genCode(final String l, final String r, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.FindInSet.exec"; + if (collation.supportsBinaryEquality) { +return String.format(expr + "Binary(%s, %s)", l, r); + } else if (collation.supportsLowercaseEquality) { +return String.format(expr + "Lowercase(%s, %s)", l, r); + } else { +return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId); + } +} +public static int execBinary(final UTF8String l, final UTF8String r) { + return r.findInSet(l); +} +public static int execLowercase(final UTF8String l, final UTF8String r) { + return r.toLowerCase().findInSet(l.toLowerCase()); +} +public static int execICU(final UTF8String l, final UTF8String r, + final int collationId) { + return CollationAwareUTF8String.findInSet(l, r, collationId); +} + } + + public static class IndexOf { Review Comment: let's keep the naming uniform if the original expression is `case class StringInstr`, then this should probably be: `public static class IndexOf` ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ## @@ -137,6 +137,76 @@ public static boolean execICU(final UTF8String l, final UTF8String r, } } + public static class FindInSet { +public static int exec(final UTF8String l, final UTF8String r, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { +return execBinary(l, r); + } else if (collation.supportsLowercaseEquality) { +return execLowercase(l, r); + } else { +return execICU(l, r, collationId); + } +} +public static String genCode(final String l, final String r, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.FindInSet.exec"; + if (collation.supportsBinaryEquality) { +return String.format(expr + "Binary(%s, %s)", l, r); + } else if (collation.supportsLowercaseEquality) { +return String.format(expr + "Lowercase(%s, %s)", l, r); + } else { +return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId); + } +} +public static int execBinary(final UTF8String l, final UTF8String r) { + return r.findInSet(l); +} +public static int execLowercase(final UTF8String l, final UTF8String r) { + return r.toLowerCase().findInSet(l.toLowerCase()); +} +public static int execICU(final UTF8String l, final UTF8String r, + final int collationId) { + return CollationAwareUTF8String.findInSet(l, r, collationId); +} + } + + public static class IndexOf { Review Comment: let's keep the naming uniform if the original expression is `case class StringInstr`, then this should probably be: `public static class StringInstr` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565494137 ## sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala: ## @@ -96,6 +95,107 @@ class CollationStringExpressionsSuite assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } + test("INSTR check result on explicitly collated strings") { Review Comment: Let's try to unify test naming e.g. `test("Support StringInstr string expression with collation")`if the test is in CollationStringExpressionsSuite.scala ## sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala: ## @@ -96,6 +95,107 @@ class CollationStringExpressionsSuite assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } + test("INSTR check result on explicitly collated strings") { +def testInStr(str: String, substr: String, collationId: Integer, expected: Integer): Unit = { + val string = Literal.create(str, StringType(collationId)) + val substring = Literal.create(substr, StringType(collationId)) + + checkEvaluation(StringInstr(string, substring), expected) +} + +var collationId = CollationFactory.collationNameToId("UTF8_BINARY") +testInStr("aaads", "Aa", collationId, 0) +testInStr("aaaDs", "de", collationId, 0) +testInStr("aaads", "ds", collationId, 4) +testInStr("", "", collationId, 1) +testInStr("", "", collationId, 0) +// scalastyle:off +testInStr("test大千世界X大千世界", "大千", collationId, 5) +testInStr("test大千世界X大千世界", "界X", collationId, 8) +// scalastyle:on + +collationId = CollationFactory.collationNameToId("UTF8_BINARY_LCASE") +testInStr("aaads", "Aa", collationId, 1) +testInStr("aaaDs", "de", collationId, 0) +testInStr("aaaDs", "ds", collationId, 4) +testInStr("", "", collationId, 1) +testInStr("", "", collationId, 0) +// scalastyle:off +testInStr("test大千世界X大千世界", "大千", collationId, 5) +testInStr("test大千世界X大千世界", "界x", collationId, 8) +// scalastyle:on + +collationId = CollationFactory.collationNameToId("UNICODE") +testInStr("aaads", "Aa", collationId, 0) +testInStr("aaads", "aa", collationId, 1) +testInStr("aaads", "de", collationId, 0) +testInStr("", "", collationId, 1) +testInStr("", "", collationId, 0) +// scalastyle:off +testInStr("test大千世界X大千世界", "界x", collationId, 0) +testInStr("test大千世界X大千世界", "界X", collationId, 8) +// scalastyle:on + +collationId = CollationFactory.collationNameToId("UNICODE_CI") +testInStr("aaads", "AD", collationId, 3) +testInStr("aaads", "dS", collationId, 4) +// scalastyle:off +testInStr("test大千世界X大千世界", "界x", collationId, 8) +// scalastyle:on + } + + test("FIND_IN_SET check result on explicitly collated strings") { Review Comment: Let's try to unify test naming e.g. `test("Support FindInSet string expression with collation")` if the test is in CollationStringExpressionsSuite.scala -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565494692 ## sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala: ## @@ -96,6 +95,107 @@ class CollationStringExpressionsSuite assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } + test("INSTR check result on explicitly collated strings") { +def testInStr(str: String, substr: String, collationId: Integer, expected: Integer): Unit = { + val string = Literal.create(str, StringType(collationId)) + val substring = Literal.create(substr, StringType(collationId)) + + checkEvaluation(StringInstr(string, substring), expected) +} + +var collationId = CollationFactory.collationNameToId("UTF8_BINARY") +testInStr("aaads", "Aa", collationId, 0) +testInStr("aaaDs", "de", collationId, 0) +testInStr("aaads", "ds", collationId, 4) +testInStr("", "", collationId, 1) +testInStr("", "", collationId, 0) +// scalastyle:off +testInStr("test大千世界X大千世界", "大千", collationId, 5) +testInStr("test大千世界X大千世界", "界X", collationId, 8) +// scalastyle:on + +collationId = CollationFactory.collationNameToId("UTF8_BINARY_LCASE") +testInStr("aaads", "Aa", collationId, 1) +testInStr("aaaDs", "de", collationId, 0) +testInStr("aaaDs", "ds", collationId, 4) +testInStr("", "", collationId, 1) +testInStr("", "", collationId, 0) +// scalastyle:off +testInStr("test大千世界X大千世界", "大千", collationId, 5) +testInStr("test大千世界X大千世界", "界x", collationId, 8) +// scalastyle:on + +collationId = CollationFactory.collationNameToId("UNICODE") +testInStr("aaads", "Aa", collationId, 0) +testInStr("aaads", "aa", collationId, 1) +testInStr("aaads", "de", collationId, 0) +testInStr("", "", collationId, 1) +testInStr("", "", collationId, 0) +// scalastyle:off +testInStr("test大千世界X大千世界", "界x", collationId, 0) +testInStr("test大千世界X大千世界", "界X", collationId, 8) +// scalastyle:on + +collationId = CollationFactory.collationNameToId("UNICODE_CI") +testInStr("aaads", "AD", collationId, 3) +testInStr("aaads", "dS", collationId, 4) +// scalastyle:off +testInStr("test大千世界X大千世界", "界x", collationId, 8) +// scalastyle:on + } + + test("FIND_IN_SET check result on explicitly collated strings") { Review Comment: Let's try to unify test naming e.g. `test("Support FindInSet string expression with collation")` in CollationStringExpressionsSuite.scala -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565494692 ## sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala: ## @@ -96,6 +95,107 @@ class CollationStringExpressionsSuite assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } + test("INSTR check result on explicitly collated strings") { +def testInStr(str: String, substr: String, collationId: Integer, expected: Integer): Unit = { + val string = Literal.create(str, StringType(collationId)) + val substring = Literal.create(substr, StringType(collationId)) + + checkEvaluation(StringInstr(string, substring), expected) +} + +var collationId = CollationFactory.collationNameToId("UTF8_BINARY") +testInStr("aaads", "Aa", collationId, 0) +testInStr("aaaDs", "de", collationId, 0) +testInStr("aaads", "ds", collationId, 4) +testInStr("", "", collationId, 1) +testInStr("", "", collationId, 0) +// scalastyle:off +testInStr("test大千世界X大千世界", "大千", collationId, 5) +testInStr("test大千世界X大千世界", "界X", collationId, 8) +// scalastyle:on + +collationId = CollationFactory.collationNameToId("UTF8_BINARY_LCASE") +testInStr("aaads", "Aa", collationId, 1) +testInStr("aaaDs", "de", collationId, 0) +testInStr("aaaDs", "ds", collationId, 4) +testInStr("", "", collationId, 1) +testInStr("", "", collationId, 0) +// scalastyle:off +testInStr("test大千世界X大千世界", "大千", collationId, 5) +testInStr("test大千世界X大千世界", "界x", collationId, 8) +// scalastyle:on + +collationId = CollationFactory.collationNameToId("UNICODE") +testInStr("aaads", "Aa", collationId, 0) +testInStr("aaads", "aa", collationId, 1) +testInStr("aaads", "de", collationId, 0) +testInStr("", "", collationId, 1) +testInStr("", "", collationId, 0) +// scalastyle:off +testInStr("test大千世界X大千世界", "界x", collationId, 0) +testInStr("test大千世界X大千世界", "界X", collationId, 8) +// scalastyle:on + +collationId = CollationFactory.collationNameToId("UNICODE_CI") +testInStr("aaads", "AD", collationId, 3) +testInStr("aaads", "dS", collationId, 4) +// scalastyle:off +testInStr("test大千世界X大千世界", "界x", collationId, 8) +// scalastyle:on + } + + test("FIND_IN_SET check result on explicitly collated strings") { Review Comment: Let's try to unify test naming e.g. `test("Support StringInstr string expression with collation")` in CollationStringExpressionsSuite.scala -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565492245 ## sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala: ## @@ -96,6 +95,107 @@ class CollationStringExpressionsSuite assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } + test("INSTR check result on explicitly collated strings") { +def testInStr(str: String, substr: String, collationId: Integer, expected: Integer): Unit = { + val string = Literal.create(str, StringType(collationId)) + val substring = Literal.create(substr, StringType(collationId)) + + checkEvaluation(StringInstr(string, substring), expected) +} + +var collationId = CollationFactory.collationNameToId("UTF8_BINARY") +testInStr("aaads", "Aa", collationId, 0) +testInStr("aaaDs", "de", collationId, 0) +testInStr("aaads", "ds", collationId, 4) +testInStr("", "", collationId, 1) +testInStr("", "", collationId, 0) +// scalastyle:off +testInStr("test大千世界X大千世界", "大千", collationId, 5) +testInStr("test大千世界X大千世界", "界X", collationId, 8) +// scalastyle:on + +collationId = CollationFactory.collationNameToId("UTF8_BINARY_LCASE") +testInStr("aaads", "Aa", collationId, 1) +testInStr("aaaDs", "de", collationId, 0) +testInStr("aaaDs", "ds", collationId, 4) +testInStr("", "", collationId, 1) +testInStr("", "", collationId, 0) +// scalastyle:off +testInStr("test大千世界X大千世界", "大千", collationId, 5) +testInStr("test大千世界X大千世界", "界x", collationId, 8) +// scalastyle:on + +collationId = CollationFactory.collationNameToId("UNICODE") +testInStr("aaads", "Aa", collationId, 0) +testInStr("aaads", "aa", collationId, 1) +testInStr("aaads", "de", collationId, 0) +testInStr("", "", collationId, 1) +testInStr("", "", collationId, 0) +// scalastyle:off +testInStr("test大千世界X大千世界", "界x", collationId, 0) +testInStr("test大千世界X大千世界", "界X", collationId, 8) +// scalastyle:on + +collationId = CollationFactory.collationNameToId("UNICODE_CI") +testInStr("aaads", "AD", collationId, 3) +testInStr("aaads", "dS", collationId, 4) +// scalastyle:off +testInStr("test大千世界X大千世界", "界x", collationId, 8) +// scalastyle:on Review Comment: unit tests should be moved to CollationSupportSuite.java, following the appropriate format please adjust your tests also, add appropriate sql tests to CollationStringExpressionsSuite.scala ## sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala: ## @@ -96,6 +95,107 @@ class CollationStringExpressionsSuite assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } + test("INSTR check result on explicitly collated strings") { +def testInStr(str: String, substr: String, collationId: Integer, expected: Integer): Unit = { + val string = Literal.create(str, StringType(collationId)) + val substring = Literal.create(substr, StringType(collationId)) + + checkEvaluation(StringInstr(string, substring), expected) +} + +var collationId = CollationFactory.collationNameToId("UTF8_BINARY") +testInStr("aaads", "Aa", collationId, 0) +testInStr("aaaDs", "de", collationId, 0) +testInStr("aaads", "ds", collationId, 4) +testInStr("", "", collationId, 1) +testInStr("", "", collationId, 0) +// scalastyle:off +testInStr("test大千世界X大千世界", "大千", collationId, 5) +testInStr("test大千世界X大千世界", "界X", collationId, 8) +// scalastyle:on + +collationId = CollationFactory.collationNameToId("UTF8_BINARY_LCASE") +testInStr("aaads", "Aa", collationId, 1) +testInStr("aaaDs", "de", collationId, 0) +testInStr("aaaDs", "ds", collationId, 4) +testInStr("", "", collationId, 1) +testInStr("", "", collationId, 0) +// scalastyle:off +testInStr("test大千世界X大千世界", "大千", collationId, 5) +testInStr("test大千世界X大千世界", "界x", collationId, 8) +// scalastyle:on + +collationId = CollationFactory.collationNameToId("UNICODE") +testInStr("aaads", "Aa", collationId, 0) +testInStr("aaads", "aa", collationId, 1) +testInStr("aaads", "de", collationId, 0) +testInStr("", "", collationId, 1) +testInStr("", "", collationId, 0) +// scalastyle:off +testInStr("test大千世界X大千世界", "界x", collationId, 0) +testInStr("test大千世界X大千世界", "界X", collationId, 8) +// scalastyle:on + +collationId = CollationFactory.collationNameToId("UNICODE_CI") +testInStr("aaads", "AD", collationId, 3) +testInStr("aaads", "dS", collationId, 4) +// scalastyle:off +testInStr("test大千世界X大千世界", "界x", collationId, 8) +// scalastyle:on + } + + test("FIND_IN_SET check result on explicitly collated strings") { +def testFindInSet(word: String, set: String, collationId: Integer, expected: Integer): Unit = { + val
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565490452 ## sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala: ## @@ -96,6 +95,107 @@ class CollationStringExpressionsSuite assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } + test("INSTR check result on explicitly collated strings") { +def testInStr(str: String, substr: String, collationId: Integer, expected: Integer): Unit = { + val string = Literal.create(str, StringType(collationId)) + val substring = Literal.create(substr, StringType(collationId)) + + checkEvaluation(StringInstr(string, substring), expected) +} + +var collationId = CollationFactory.collationNameToId("UTF8_BINARY") +testInStr("aaads", "Aa", collationId, 0) +testInStr("aaaDs", "de", collationId, 0) +testInStr("aaads", "ds", collationId, 4) +testInStr("", "", collationId, 1) +testInStr("", "", collationId, 0) +// scalastyle:off +testInStr("test大千世界X大千世界", "大千", collationId, 5) +testInStr("test大千世界X大千世界", "界X", collationId, 8) +// scalastyle:on + +collationId = CollationFactory.collationNameToId("UTF8_BINARY_LCASE") +testInStr("aaads", "Aa", collationId, 1) +testInStr("aaaDs", "de", collationId, 0) +testInStr("aaaDs", "ds", collationId, 4) +testInStr("", "", collationId, 1) +testInStr("", "", collationId, 0) +// scalastyle:off +testInStr("test大千世界X大千世界", "大千", collationId, 5) +testInStr("test大千世界X大千世界", "界x", collationId, 8) +// scalastyle:on + +collationId = CollationFactory.collationNameToId("UNICODE") +testInStr("aaads", "Aa", collationId, 0) +testInStr("aaads", "aa", collationId, 1) +testInStr("aaads", "de", collationId, 0) +testInStr("", "", collationId, 1) +testInStr("", "", collationId, 0) +// scalastyle:off +testInStr("test大千世界X大千世界", "界x", collationId, 0) +testInStr("test大千世界X大千世界", "界X", collationId, 8) +// scalastyle:on + +collationId = CollationFactory.collationNameToId("UNICODE_CI") +testInStr("aaads", "AD", collationId, 3) +testInStr("aaads", "dS", collationId, 4) +// scalastyle:off +testInStr("test大千世界X大千世界", "界x", collationId, 8) +// scalastyle:on + } + + test("FIND_IN_SET check result on explicitly collated strings") { +def testFindInSet(word: String, set: String, collationId: Integer, expected: Integer): Unit = { + val w = Literal.create(word, StringType(collationId)) + val s = Literal.create(set, StringType(collationId)) + + checkEvaluation(FindInSet(w, s), expected) +} Review Comment: we shouldn't use unit tests like this, please take a look at the other tests in their appropriate suites: - CollationSupportSuite.java for unit tests related to newly introduced collation-aware support in CollationSupport.java - CollationStringExpressionsSuite.scala for e2e SQL tests that should be used in limited quantity read more in the refactor Jira ticket description notes: https://issues.apache.org/jira/browse/SPARK-47410 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565489630 ## sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala: ## @@ -96,6 +95,107 @@ class CollationStringExpressionsSuite assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } + test("INSTR check result on explicitly collated strings") { +def testInStr(str: String, substr: String, collationId: Integer, expected: Integer): Unit = { + val string = Literal.create(str, StringType(collationId)) + val substring = Literal.create(substr, StringType(collationId)) + + checkEvaluation(StringInstr(string, substring), expected) +} Review Comment: we shouldn't use unit tests like this, please take a look at the other tests in their appropriate suites: - CollationSupportSuite.java for unit tests related to newly introduced collation-aware support in CollationSupport.java - CollationStringExpressionsSuite.scala for e2e SQL tests that should be used in limited quantity read more in the refactor Jira ticket description notes: https://issues.apache.org/jira/browse/SPARK-47410 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565484403 ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala: ## @@ -1346,20 +1350,24 @@ case class StringTrimRight(srcStr: Expression, trimStr: Option[Expression] = Non case class StringInstr(str: Expression, substr: Expression) extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { + final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId + override def left: Expression = str override def right: Expression = substr override def dataType: DataType = IntegerType - override def inputTypes: Seq[DataType] = Seq(StringType, StringType) + override def inputTypes: Seq[AbstractDataType] = +Seq(StringTypeAnyCollation, StringTypeAnyCollation) override def nullSafeEval(string: Any, sub: Any): Any = { -string.asInstanceOf[UTF8String].indexOf(sub.asInstanceOf[UTF8String], 0) + 1 +CollationSupport.IndexOf. + exec(string.asInstanceOf[UTF8String], sub.asInstanceOf[UTF8String], 0, collationId) + 1 } override def prettyName: String = "instr" override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -defineCodeGen(ctx, ev, (l, r) => - s"($l).indexOf($r, 0) + 1") + defineCodeGen(ctx, ev, (l, r) => +CollationSupport.IndexOf.genCode(l, r, 0, collationId) + " + 1") Review Comment: perhaps we should use "string" & "sub" (instead of "l" & "r") to preserve the original naming -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1565482699 ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala: ## @@ -974,15 +974,19 @@ case class StringTranslate(srcExpr: Expression, matchingExpr: Expression, replac case class FindInSet(left: Expression, right: Expression) extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId + + override def inputTypes: Seq[AbstractDataType] = +Seq(StringTypeAnyCollation, StringTypeAnyCollation) - override protected def nullSafeEval(word: Any, set: Any): Any = -set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String]) + override protected def nullSafeEval(word: Any, set: Any): Any = { +CollationSupport.FindInSet. + exec(word.asInstanceOf[UTF8String], set.asInstanceOf[UTF8String], collationId) + } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -nullSafeCodeGen(ctx, ev, (word, set) => - s"${ev.value} = $set.findInSet($word);" -) +nullSafeCodeGen(ctx, ev, (l, r) => + s"${ev.value} = " + CollationSupport.FindInSet.genCode(l, r, collationId) + ";") Review Comment: Did you try just using `defineCodeGen`? `defineCodeGen(ctx, ev, (word, set) => CollationSupport.FindInSet.genCode(word, set, collationId))` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on PR #45643: URL: https://github.com/apache/spark/pull/45643#issuecomment-2049812695 heads up: we’ve done some major code restructuring in https://github.com/apache/spark/pull/45978, so please sync these changes before moving on @miland-db you’ll likely need to rewrite the code in this PR, so please follow the guidelines outlined in https://issues.apache.org/jira/browse/SPARK-47410 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
dbatomic commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1555475151 ## common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java: ## @@ -549,6 +549,51 @@ public int findInSet(UTF8String match) { return 0; } + public int findInSet(UTF8String match, int collationId) { +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + return this.findInSet(match); +} +if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) { Review Comment: @cloud-fan - Let's put on hold string expression development until we provide proper design. @uros-db, @miland-db and I will follow up on this today. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
cloud-fan commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1555443962 ## common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java: ## @@ -549,6 +549,51 @@ public int findInSet(UTF8String match) { return 0; } + public int findInSet(UTF8String match, int collationId) { +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + return this.findInSet(match); +} +if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) { Review Comment: "lowercase collator" SGTM -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
miland-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1555394471 ## common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java: ## @@ -549,6 +549,51 @@ public int findInSet(UTF8String match) { return 0; } + public int findInSet(UTF8String match, int collationId) { +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + return this.findInSet(match); +} +if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) { Review Comment: We can solve this in the same way we did i t here: [STRING_LOCATE](https://github.com/apache/spark/pull/45791). In the next PRs we updated `CollationFactory` to be able to return "lowercase collator" so we can unify the way we deal with collated strings. If you prefer, I can update this PR with that change. ## common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java: ## @@ -549,6 +549,51 @@ public int findInSet(UTF8String match) { return 0; } + public int findInSet(UTF8String match, int collationId) { +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + return this.findInSet(match); +} +if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) { Review Comment: We can solve this in the same way we did it here: [STRING_LOCATE](https://github.com/apache/spark/pull/45791). In the next PRs we updated `CollationFactory` to be able to return "lowercase collator" so we can unify the way we deal with collated strings. If you prefer, I can update this PR with that change. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
dbatomic commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1555389826 ## common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java: ## @@ -549,6 +549,51 @@ public int findInSet(UTF8String match) { return 0; } + public int findInSet(UTF8String match, int collationId) { +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + return this.findInSet(match); +} +if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) { Review Comment: That being said, I also don't like current direction of pushing everything into `UTF8String`. Let me see if we can come up with some cleaner approach. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
dbatomic commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1555378573 ## common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java: ## @@ -549,6 +549,51 @@ public int findInSet(UTF8String match) { return 0; } + public int findInSet(UTF8String match, int collationId) { +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + return this.findInSet(match); +} +if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) { Review Comment: We don't have to expect more special collations besides `UTF8_BINARY_LCASE`. Everything else will be based on ICU. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
cloud-fan commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1555173817 ## common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java: ## @@ -549,6 +549,51 @@ public int findInSet(UTF8String match) { return 0; } + public int findInSet(UTF8String match, int collationId) { +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + return this.findInSet(match); +} +if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) { Review Comment: @dbatomic do we have a general principle for this special collation? Always lower-case first and then reuse existing UTF8String functions? Will we have more collations like this in the future? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
miland-db commented on PR #45643: URL: https://github.com/apache/spark/pull/45643#issuecomment-2039386033 @MaxGekk can you please review this? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
miland-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1549579283 ## common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java: ## @@ -549,6 +549,45 @@ public int findInSet(UTF8String match) { return 0; } + public int findInSet(UTF8String match, int collationId) { +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + return this.findInSet(match); +} +if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) { + return this.toLowerCase().findInSet(match.toLowerCase()); +} +return collationAwareFindInSet(match, collationId); +} + + private int collationAwareFindInSet(UTF8String match, int collationId) { +if (match.contains(COMMA_UTF8)) { + return 0; +} + +StringSearch stringSearch = CollationFactory.getStringSearch(this, match, collationId); + +String setString = this.toString(); +int wordStart = 0; +while ((wordStart = stringSearch.next()) != StringSearch.DONE) { + boolean isValidStart = wordStart == 0 || setString.charAt(wordStart - 1) == ','; + boolean isValidEnd = wordStart + stringSearch.getMatchLength() == setString.length() + || setString.charAt(wordStart + stringSearch.getMatchLength()) == ','; + + if (isValidStart && isValidEnd) { +int pos = 0; +for (int i = 0; i < setString.length() && i < wordStart; i++) { + if (setString.charAt(i) == ',') { +pos++; + } +} + +return pos + 1; Review Comment: I think it is specific for this case because we don't need methods like this in other places: `private int countBefore(UTF8String pattern, int collationId, int end)` `private int countAfter(UTF8String pattern, int collationId, int start)` Maybe if we encounter such a case, we can adapt code then. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
miland-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1551227337 ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala: ## @@ -994,15 +994,29 @@ case class StringTranslate(srcExpr: Expression, matchingExpr: Expression, replac case class FindInSet(left: Expression, right: Expression) extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId + + override def inputTypes: Seq[AbstractDataType] = +Seq(StringTypeAnyCollation, StringTypeAnyCollation) - override protected def nullSafeEval(word: Any, set: Any): Any = -set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String]) + override protected def nullSafeEval(word: Any, set: Any): Any = { +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String]) +} else { + set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String], collationId) +} + } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -nullSafeCodeGen(ctx, ev, (word, set) => - s"${ev.value} = $set.findInSet($word);" -) +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + nullSafeCodeGen(ctx, ev, (word, set) => s"${ev.value} = $set.findInSet($word);") +} else { + nullSafeCodeGen(ctx, ev, (word, set) => s"${ev.value} = $set.findInSet($word, $collationId);") +} + } + + override def checkInputDataTypes(): TypeCheckResult = { +super.checkInputDataTypes() } Review Comment: Ok, removed! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1551195271 ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala: ## @@ -994,15 +994,29 @@ case class StringTranslate(srcExpr: Expression, matchingExpr: Expression, replac case class FindInSet(left: Expression, right: Expression) extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId + + override def inputTypes: Seq[AbstractDataType] = +Seq(StringTypeAnyCollation, StringTypeAnyCollation) - override protected def nullSafeEval(word: Any, set: Any): Any = -set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String]) + override protected def nullSafeEval(word: Any, set: Any): Any = { +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String]) +} else { + set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String], collationId) +} + } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -nullSafeCodeGen(ctx, ev, (word, set) => - s"${ev.value} = $set.findInSet($word);" -) +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + nullSafeCodeGen(ctx, ev, (word, set) => s"${ev.value} = $set.findInSet($word);") +} else { + nullSafeCodeGen(ctx, ev, (word, set) => s"${ev.value} = $set.findInSet($word, $collationId);") +} + } + + override def checkInputDataTypes(): TypeCheckResult = { +super.checkInputDataTypes() } Review Comment: I don't think this is needed anymore (& elsewhere too) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
miland-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1551132518 ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala: ## @@ -1002,15 +1002,34 @@ case class StringTranslate(srcExpr: Expression, matchingExpr: Expression, replac case class FindInSet(left: Expression, right: Expression) extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId - override protected def nullSafeEval(word: Any, set: Any): Any = -set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String]) + override def inputTypes: Seq[AbstractDataType] = +Seq(StringTypeAnyCollation, StringTypeAnyCollation) + + override protected def nullSafeEval(word: Any, set: Any): Any = { +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String]) +} else { + set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String], collationId) +} + } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -nullSafeCodeGen(ctx, ev, (word, set) => - s"${ev.value} = $set.findInSet($word);" -) +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + nullSafeCodeGen(ctx, ev, (word, set) => s"${ev.value} = $set.findInSet($word);") +} else { + nullSafeCodeGen(ctx, ev, (word, set) => s"${ev.value} = $set.findInSet($word, $collationId);") +} + } + + override def checkInputDataTypes(): TypeCheckResult = { +val defaultCheck = super.checkInputDataTypes() +if (defaultCheck.isFailure) { + return defaultCheck +} + +CollationTypeConstraints.checkCollationCompatibility(collationId, children.map(_.dataType)) Review Comment: Sure, will do -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1550968894 ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala: ## @@ -1002,15 +1002,34 @@ case class StringTranslate(srcExpr: Expression, matchingExpr: Expression, replac case class FindInSet(left: Expression, right: Expression) extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId - override protected def nullSafeEval(word: Any, set: Any): Any = -set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String]) + override def inputTypes: Seq[AbstractDataType] = +Seq(StringTypeAnyCollation, StringTypeAnyCollation) + + override protected def nullSafeEval(word: Any, set: Any): Any = { +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String]) +} else { + set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String], collationId) +} + } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -nullSafeCodeGen(ctx, ev, (word, set) => - s"${ev.value} = $set.findInSet($word);" -) +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + nullSafeCodeGen(ctx, ev, (word, set) => s"${ev.value} = $set.findInSet($word);") +} else { + nullSafeCodeGen(ctx, ev, (word, set) => s"${ev.value} = $set.findInSet($word, $collationId);") +} + } + + override def checkInputDataTypes(): TypeCheckResult = { +val defaultCheck = super.checkInputDataTypes() +if (defaultCheck.isFailure) { + return defaultCheck +} + +CollationTypeConstraints.checkCollationCompatibility(collationId, children.map(_.dataType)) Review Comment: heads up: [implicit cast](https://github.com/apache/spark/pull/45383) is now live please sync your fork and rebase your changes accordingly -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
miland-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1549579283 ## common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java: ## @@ -549,6 +549,45 @@ public int findInSet(UTF8String match) { return 0; } + public int findInSet(UTF8String match, int collationId) { +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + return this.findInSet(match); +} +if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) { + return this.toLowerCase().findInSet(match.toLowerCase()); +} +return collationAwareFindInSet(match, collationId); +} + + private int collationAwareFindInSet(UTF8String match, int collationId) { +if (match.contains(COMMA_UTF8)) { + return 0; +} + +StringSearch stringSearch = CollationFactory.getStringSearch(this, match, collationId); + +String setString = this.toString(); +int wordStart = 0; +while ((wordStart = stringSearch.next()) != StringSearch.DONE) { + boolean isValidStart = wordStart == 0 || setString.charAt(wordStart - 1) == ','; + boolean isValidEnd = wordStart + stringSearch.getMatchLength() == setString.length() + || setString.charAt(wordStart + stringSearch.getMatchLength()) == ','; + + if (isValidStart && isValidEnd) { +int pos = 0; +for (int i = 0; i < setString.length() && i < wordStart; i++) { + if (setString.charAt(i) == ',') { +pos++; + } +} + +return pos + 1; Review Comment: I think it is specific for this case because we don't need methods like in other places: `private int countBefore(UTF8String pattern, int collationId, int end)` `private int countAfter(UTF8String pattern, int collationId, int start)` Maybe if we encounter such a case, we can adapt code then. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1549419980 ## common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java: ## @@ -549,6 +549,45 @@ public int findInSet(UTF8String match) { return 0; } + public int findInSet(UTF8String match, int collationId) { Review Comment: I think we now have a lot of new methods consider adding comments such as: https://github.com/apache/spark/pull/45791/files#r1549415554 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1549393104 ## sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala: ## @@ -73,17 +74,78 @@ class CollationStringExpressionsSuite extends QueryTest }) } + test("INSTR check result on explicitly collated strings") { Review Comment: try covering some more edge cases, such as: empty strings, uppercase and lowercase mix, different byte-length chars, etc. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1549393104 ## sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala: ## @@ -73,17 +74,78 @@ class CollationStringExpressionsSuite extends QueryTest }) } + test("INSTR check result on explicitly collated strings") { Review Comment: (goes for all tests) try covering some more edge cases, such as: empty strings, uppercase and lowercase mix, different byte-length chars, etc. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1549387717 ## sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala: ## @@ -73,17 +74,78 @@ class CollationStringExpressionsSuite extends QueryTest }) } + test("INSTR check result on explicitly collated strings") { +def testInStr(str: String, substr: String, collationId: Integer, expected: Integer): Unit = { + val string = Literal.create(str, StringType(collationId)) + val substring = Literal.create(substr, StringType(collationId)) + + checkEvaluation(StringInstr(string, substring), expected) +} + +var collationId = CollationFactory.collationNameToId("UTF8_BINARY_LCASE") Review Comment: since you have some "UTF8_BINARY" tests down below, consider adding a couple here as well -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1549378473 ## common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java: ## @@ -549,6 +549,45 @@ public int findInSet(UTF8String match) { return 0; } + public int findInSet(UTF8String match, int collationId) { +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + return this.findInSet(match); +} +if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) { + return this.toLowerCase().findInSet(match.toLowerCase()); +} +return collationAwareFindInSet(match, collationId); +} + + private int collationAwareFindInSet(UTF8String match, int collationId) { +if (match.contains(COMMA_UTF8)) { + return 0; +} + +StringSearch stringSearch = CollationFactory.getStringSearch(this, match, collationId); + +String setString = this.toString(); +int wordStart = 0; +while ((wordStart = stringSearch.next()) != StringSearch.DONE) { + boolean isValidStart = wordStart == 0 || setString.charAt(wordStart - 1) == ','; + boolean isValidEnd = wordStart + stringSearch.getMatchLength() == setString.length() + || setString.charAt(wordStart + stringSearch.getMatchLength()) == ','; + + if (isValidStart && isValidEnd) { +int pos = 0; +for (int i = 0; i < setString.length() && i < wordStart; i++) { + if (setString.charAt(i) == ',') { +pos++; + } +} + +return pos + 1; Review Comment: if this piece of code is just counting commas, consider using a separate java function for this if such function doesn't currently exist, perhaps you could modularize this code a bit and introduce it -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-47411][SQL] Support INSTR & FIND_IN_SET functions to work with collated strings [spark]
uros-db commented on code in PR #45643: URL: https://github.com/apache/spark/pull/45643#discussion_r1549356302 ## common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java: ## @@ -549,6 +549,45 @@ public int findInSet(UTF8String match) { return 0; } + public int findInSet(UTF8String match, int collationId) { +if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) { + return this.findInSet(match); +} +if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) { + return this.toLowerCase().findInSet(match.toLowerCase()); +} +return collationAwareFindInSet(match, collationId); +} Review Comment: fix indentation -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org