nikolamand-db commented on code in PR #46180: URL: https://github.com/apache/spark/pull/46180#discussion_r1601383798
########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java: ########## @@ -117,76 +118,490 @@ public Collation( } /** - * Constructor with comparators that are inherited from the given collator. + * collation id (32-bit integer) layout: + * bit 31: 0 = predefined collation, 1 = user-defined collation + * bit 30: 0 = utf8-binary, 1 = ICU + * bit 29: 0 = case-sensitive, 1 = case-insensitive + * bit 28: 0 = accent-sensitive, 1 = accent-insensitive + * bit 27-26: 00 = unspecified, 01 = punctuation-sensitive, 10 = punctuation-insensitive + * bit 25-24: 00 = unspecified, 01 = first-lower, 10 = first-upper + * bit 23-22: 00 = unspecified, 01 = to-lower, 10 = to-upper + * bit 21-20: 00 = unspecified, 01 = trim-left, 10 = trim-right, 11 = trim-both + * bit 19-18: zeroes, reserved for version + * bit 17-16: zeroes + * bit 15-0: locale id for ICU collations / zeroes for utf8-binary */ - public Collation( - String collationName, - Collator collator, - String version, - boolean supportsBinaryEquality, - boolean supportsBinaryOrdering, - boolean supportsLowercaseEquality) { - this( - collationName, - collator, - (s1, s2) -> collator.compare(s1.toString(), s2.toString()), - version, - s -> (long)collator.getCollationKey(s.toString()).hashCode(), - supportsBinaryEquality, - supportsBinaryOrdering, - supportsLowercaseEquality); - } - } + private static class CollationSpec { + private enum ImplementationProvider { + UTF8_BINARY, ICU + } + + private enum CaseSensitivity { + CS, CI + } + + private enum AccentSensitivity { + AS, AI + } + + private enum PunctuationSensitivity { + UNSPECIFIED, PS, PI + } - private static final Collation[] collationTable = new Collation[4]; - private static final HashMap<String, Integer> collationNameToIdMap = new HashMap<>(); - - public static final int UTF8_BINARY_COLLATION_ID = 0; - public static final int UTF8_BINARY_LCASE_COLLATION_ID = 1; - - static { - // Binary comparison. This is the default collation. - // No custom comparators will be used for this collation. - // Instead, we rely on byte for byte comparison. - collationTable[0] = new Collation( - "UTF8_BINARY", - null, - UTF8String::binaryCompare, - "1.0", - s -> (long)s.hashCode(), - true, - true, - false); - - // Case-insensitive UTF8 binary collation. - // TODO: Do in place comparisons instead of creating new strings. - collationTable[1] = new Collation( - "UTF8_BINARY_LCASE", - null, - UTF8String::compareLowerCase, - "1.0", - (s) -> (long)s.toLowerCase().hashCode(), - false, - false, - true); - - // UNICODE case sensitive comparison (ROOT locale, in ICU). - collationTable[2] = new Collation( - "UNICODE", Collator.getInstance(ULocale.ROOT), "153.120.0.0", true, false, false); - collationTable[2].collator.setStrength(Collator.TERTIARY); - collationTable[2].collator.freeze(); - - // UNICODE case-insensitive comparison (ROOT locale, in ICU + Secondary strength). - collationTable[3] = new Collation( - "UNICODE_CI", Collator.getInstance(ULocale.ROOT), "153.120.0.0", false, false, false); - collationTable[3].collator.setStrength(Collator.SECONDARY); - collationTable[3].collator.freeze(); - - for (int i = 0; i < collationTable.length; i++) { - collationNameToIdMap.put(collationTable[i].collationName, i); + private enum FirstLetterPreference { + UNSPECIFIED, FU, FL + } + + private enum CaseConversion { + UNSPECIFIED, LCASE, UCASE + } + + private enum SpaceTrimming { + UNSPECIFIED, LTRIM, RTRIM, TRIM + } + + private static final int implementationProviderOffset = 30; + private static final int implementationProviderLen = 1; + private static final int caseSensitivityOffset = 29; + private static final int caseSensitivityLen = 1; + private static final int accentSensitivityOffset = 28; + private static final int accentSensitivityLen = 1; + private static final int punctuationSensitivityOffset = 26; + private static final int punctuationSensitivityLen = 2; + private static final int firstLetterPreferenceOffset = 24; + private static final int firstLetterPreferenceLen = 2; + private static final int caseConversionOffset = 22; + private static final int caseConversionLen = 2; + private static final int spaceTrimmingOffset = 20; + private static final int spaceTrimmingLen = 2; + private static final int localeOffset = 0; + private static final int localeLen = 16; + + private static final String[] ICULocaleNames; + private static final Map<String, ULocale> ICULocaleMap = new HashMap<>(); + private static final Map<String, String> ICULocaleMapUppercase = new HashMap<>(); + private static final Map<String, Integer> ICULocaleToId = new HashMap<>(); + + static { + ICULocaleMap.put("UNICODE", ULocale.ROOT); + ULocale[] locales = Collator.getAvailableULocales(); Review Comment: Please check [comment](https://github.com/apache/spark/pull/46180/files#r1601363774) below. We assume is that list of available locales is going to be pretty stable and added golden file to verify this. However, if we bump ICU version and any changes do occur (for example, ICU adds support for new locales and the sorted list of locale names changes), users shouldn't still be affected because collation ids are internal to Spark. Still, problems could arise if there are multiple executors where one is running a Spark version prior to this breaking change and the other is running Spark version after the change though I'm not aware though of particular scenario where this could happen. But even if this does happen and golden file reports unwanted changes in the diff, we can add more logic to enforce no breaking changes to the existing locale ids are made. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org