This is an automated email from the ASF dual-hosted git repository.
stoty pushed a commit to branch 5.1
in repository https://gitbox.apache.org/repos/asf/phoenix.git
The following commit(s) were added to refs/heads/5.1 by this push:
new 35a8b52d7a PHOENIX-6818 Remove dependency on the i18n-util library
35a8b52d7a is described below
commit 35a8b52d7a86d090ef514857382359875ca2cf5b
Author: Mate Szalay-Beko <[email protected]>
AuthorDate: Thu Nov 10 16:11:49 2022 +0100
PHOENIX-6818 Remove dependency on the i18n-util library
i18n-util is not maintained anymore, but uses icu4j dependencies having CVE
issues. To avoid these problems, I copied the relevant code from
i18n-util and used the latest icu4j version.
---
dev/release_files/LICENSE | 2 +-
phoenix-core/pom.xml | 8 +-
.../expression/function/CollationKeyFunction.java | 12 +-
.../phoenix/expression/function/LowerFunction.java | 3 +-
.../phoenix/expression/function/UpperFunction.java | 21 +-
.../apache/phoenix/util/DeferredStringBuilder.java | 135 +++
.../apache/phoenix/util/i18n/LinguisticSort.java | 1172 ++++++++++++++++++++
.../org/apache/phoenix/util/i18n/LocaleUtils.java | 86 ++
.../org/apache/phoenix/util/i18n/OracleUpper.java | 82 ++
.../apache/phoenix/util/i18n/OracleUpperTable.java | 337 ++++++
.../org/apache/phoenix/util/i18n/package-info.java | 27 +
.../phoenix/util/i18n/LinguisticSortTest.java | 650 +++++++++++
.../util/i18n/OracleUpperTableGeneratorTest.java | 391 +++++++
pom.xml | 13 +-
14 files changed, 2912 insertions(+), 27 deletions(-)
diff --git a/dev/release_files/LICENSE b/dev/release_files/LICENSE
index 4577518c7a..c3c68268f8 100644
--- a/dev/release_files/LICENSE
+++ b/dev/release_files/LICENSE
@@ -254,7 +254,7 @@ Janino Compiler (https://github.com/janino-compiler/janino)
Hamcrest-core 1.3 (http://www.hamcrest.org) Copyright (c) 2000-2006,
www.hamcrest.org
-i18n-util 1.0.1 (https://github.com/salesforce/i18n-util) Copyright (c) 2017,
Salesforce.com, Inc. All rights reserved.
+icu4j (https://github.com/unicode-org/icu) Copyright (c) 2016 and later
Unicode, Inc. and others. All Rights Reserved.
---
diff --git a/phoenix-core/pom.xml b/phoenix-core/pom.xml
index 5369b2b0e5..c8f7a18f30 100644
--- a/phoenix-core/pom.xml
+++ b/phoenix-core/pom.xml
@@ -516,8 +516,12 @@
<artifactId>stream</artifactId>
</dependency>
<dependency>
- <groupId>com.salesforce.i18n</groupId>
- <artifactId>i18n-util</artifactId>
+ <groupId>com.ibm.icu</groupId>
+ <artifactId>icu4j</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>com.ibm.icu</groupId>
+ <artifactId>icu4j-localespi</artifactId>
</dependency>
<dependency>
<groupId>com.lmax</groupId>
diff --git
a/phoenix-core/src/main/java/org/apache/phoenix/expression/function/CollationKeyFunction.java
b/phoenix-core/src/main/java/org/apache/phoenix/expression/function/CollationKeyFunction.java
index f5cbdc4557..676b6460df 100644
---
a/phoenix-core/src/main/java/org/apache/phoenix/expression/function/CollationKeyFunction.java
+++
b/phoenix-core/src/main/java/org/apache/phoenix/expression/function/CollationKeyFunction.java
@@ -35,11 +35,11 @@ import org.apache.phoenix.schema.types.PInteger;
import org.apache.phoenix.schema.types.PVarbinary;
import org.apache.phoenix.schema.types.PVarchar;
import org.apache.phoenix.util.VarBinaryFormatter;
+import org.apache.phoenix.util.i18n.LinguisticSort;
+import org.apache.phoenix.util.i18n.LocaleUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import com.force.db.i18n.LinguisticSort;
-import com.force.i18n.LocaleUtils;
/**
* A Phoenix Function that calculates a collation key for an input string based
@@ -51,10 +51,12 @@ import com.force.i18n.LocaleUtils;
* are all valid locale representations. Note the language code, country code
* and variant are used as arguments to the constructor of java.util.Locale.
*
- * This function uses the open-source i18n-util package to obtain the collators
- * it needs from the provided locale.
+ * This function originally used the open-source i18n-util package to obtain
the
+ * collators it needs from the provided locale. As i18n-util is not maintained
+ * anymore, the relevant parts from it were copied into Phoenix.
+ * See: https://issues.apache.org/jira/browse/PHOENIX-6818
*
- * The LinguisticSort implementation in i18n-util encapsulates sort-related
+ * The LinguisticSort implementation from i18n-util encapsulates sort-related
* functionality for a substantive list of locales. For each locale, it
provides
* a collator and an Oracle-specific database function that can be used to sort
* strings according to the natural language rules of that locale.
diff --git
a/phoenix-core/src/main/java/org/apache/phoenix/expression/function/LowerFunction.java
b/phoenix-core/src/main/java/org/apache/phoenix/expression/function/LowerFunction.java
index f444d36b5f..264ebfbb79 100644
---
a/phoenix-core/src/main/java/org/apache/phoenix/expression/function/LowerFunction.java
+++
b/phoenix-core/src/main/java/org/apache/phoenix/expression/function/LowerFunction.java
@@ -30,8 +30,7 @@ import org.apache.phoenix.parse.FunctionParseNode;
import org.apache.phoenix.schema.tuple.Tuple;
import org.apache.phoenix.schema.types.PDataType;
import org.apache.phoenix.schema.types.PVarchar;
-
-import com.force.i18n.LocaleUtils;
+import org.apache.phoenix.util.i18n.LocaleUtils;
@FunctionParseNode.BuiltInFunction(name=LowerFunction.NAME, args={
@FunctionParseNode.Argument(allowedTypes={PVarchar.class}),
diff --git
a/phoenix-core/src/main/java/org/apache/phoenix/expression/function/UpperFunction.java
b/phoenix-core/src/main/java/org/apache/phoenix/expression/function/UpperFunction.java
index 0969269ba6..56a228c6dd 100644
---
a/phoenix-core/src/main/java/org/apache/phoenix/expression/function/UpperFunction.java
+++
b/phoenix-core/src/main/java/org/apache/phoenix/expression/function/UpperFunction.java
@@ -1,11 +1,10 @@
/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
@@ -15,7 +14,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
package org.apache.phoenix.expression.function;
import java.io.DataInput;
@@ -25,15 +23,12 @@ import java.util.List;
import java.util.Locale;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
-
import org.apache.phoenix.expression.Expression;
import org.apache.phoenix.parse.FunctionParseNode;
+import org.apache.phoenix.schema.tuple.Tuple;
import org.apache.phoenix.schema.types.PDataType;
import org.apache.phoenix.schema.types.PVarchar;
-
-import com.force.i18n.LocaleUtils;
-
-import org.apache.phoenix.schema.tuple.Tuple;
+import org.apache.phoenix.util.i18n.LocaleUtils;
@FunctionParseNode.BuiltInFunction(name=UpperFunction.NAME, args={
@FunctionParseNode.Argument(allowedTypes={PVarchar.class}),
diff --git
a/phoenix-core/src/main/java/org/apache/phoenix/util/DeferredStringBuilder.java
b/phoenix-core/src/main/java/org/apache/phoenix/util/DeferredStringBuilder.java
new file mode 100644
index 0000000000..45dec5c22e
--- /dev/null
+++
b/phoenix-core/src/main/java/org/apache/phoenix/util/DeferredStringBuilder.java
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.phoenix.util;
+
+/**
+ * This utility class was partially copied from Salesforce's
internationalization utility library
+ * (com.salesforce.i18n:i18n-util:1.0.4), which was released under the
3-clause BSD License.
+ * The i18n-util library is not maintained anymore, and it was using
vulnerable dependencies.
+ * For more info, see: https://issues.apache.org/jira/browse/PHOENIX-6818
+ *
+ * This class implements a StringBuilder that is incrementally copied from a
source String.
+ * Actual creation of the new buffer is deferred until a character differs from a character at
+ * the same position in the source String. This class is useful for reducing
garbage creation
+ * when doing operations like escaping a String, when most Strings are not
expected to contain
+ * any escapable characters. In that case, no additional memory is used (as
the original
+ * String is not actually copied).
+ */
+public final class DeferredStringBuilder implements Appendable, CharSequence {
+
+    /** Real buffer; stays {@code null} while every appended character matches {@code source}. */
+    private StringBuilder buf;
+
+    /** Number of leading {@code source} characters matched so far; used while {@code buf} is null. */
+    private int pos;
+
+    /** Sequence the appended characters are compared against; may be {@code null}. */
+    private final CharSequence source;
+
+    /**
+     * @param source the sequence the output is expected to mostly equal; when {@code null},
+     *               a real buffer is allocated immediately
+     */
+    public DeferredStringBuilder(CharSequence source) {
+        if (source == null) {
+            this.buf = new StringBuilder(16);
+        }
+        this.source = source;
+    }
+
+    /**
+     * Appends one character. While no buffer exists, a character equal to the next source
+     * character only advances the match position; the first mismatch allocates the buffer.
+     */
+    @Override
+    public DeferredStringBuilder append(char c) {
+        if (this.buf != null) {
+            // we've already got the buf - just add this character
+            this.buf.append(c);
+        } else if (this.pos < this.source.length() && c == this.source.charAt(this.pos)) {
+            // characters match - just move ahead
+            ++this.pos;
+        } else {
+            // character mismatch - now we need to allocate a real StringBuilder
+            allocateBuffer();
+            this.buf.append(c);
+        }
+        return this;
+    }
+
+    /** Appends the whole sequence; a {@code null} argument is ignored. */
+    @Override
+    public DeferredStringBuilder append(CharSequence csq) {
+        if (csq == null) {
+            return this;
+        }
+        return append(csq, 0, csq.length());
+    }
+
+    /**
+     * Appends {@code csq[start, end)}; a {@code null} sequence is ignored.
+     *
+     * @param csq   sequence to read from
+     * @param start index of the first character to append
+     * @param end   index one past the last character to append
+     */
+    @Override
+    public DeferredStringBuilder append(CharSequence csq, int start, int end) {
+        if (csq == null) {
+            return this;
+        }
+        if (this.buf != null) {
+            // We know it's different, so just append the whole range.
+            this.buf.append(csq, start, end);
+            return this;
+        }
+        int chars = end - start;
+        // For small strings, or when the range would run past the end of the source,
+        // do it char by char.
+        if (chars < 10 || (this.pos + chars > this.source.length())) {
+            for (int i = start; i < end; ++i) {
+                append(csq.charAt(i));
+            }
+            return this;
+        }
+        CharSequence subSeq = csq.subSequence(start, end);
+        // Compare by content: CharSequence.equals() is not defined as content equality, so a
+        // matching non-String sequence would otherwise be treated as a mismatch and force an
+        // unnecessary buffer allocation. A bulk String comparison is kept (rather than a
+        // charAt + length loop) because the original author found it faster under a profiler.
+        String expected = this.source.subSequence(this.pos, this.pos + chars).toString();
+        if (expected.contentEquals(subSeq)) {
+            this.pos += chars;
+        } else {
+            allocateBuffer();
+            this.buf.append(subSeq);
+        }
+        return this;
+    }
+
+    /** Creates the real buffer and seeds it with the source prefix matched so far. */
+    private void allocateBuffer() {
+        this.buf = new StringBuilder(this.source.length() + 16);
+        this.buf.append(this.source.subSequence(0, this.pos));
+    }
+
+    /**
+     * @throws StringIndexOutOfBoundsException if no buffer exists and {@code index} is at or
+     *         beyond the matched prefix
+     */
+    @Override
+    public char charAt(int index) {
+        if (this.buf != null) {
+            return this.buf.charAt(index);
+        } else if (index < pos) {
+            return this.source.charAt(index);
+        } else {
+            throw new StringIndexOutOfBoundsException(index);
+        }
+    }
+
+    /**
+     * @throws StringIndexOutOfBoundsException if no buffer exists and {@code end} is beyond
+     *         the matched prefix
+     */
+    @Override
+    public CharSequence subSequence(int start, int end) {
+        if (this.buf != null) {
+            return this.buf.subSequence(start, end);
+        } else if (end <= pos) {
+            return this.source.subSequence(start, end);
+        } else {
+            throw new StringIndexOutOfBoundsException(end);
+        }
+    }
+
+    /**
+     * Returns the built text. When everything appended matched the entire source, the
+     * source's own {@code toString()} is returned, so no copy is made.
+     */
+    @Override
+    public String toString() {
+        if (this.buf != null) {
+            return this.buf.toString();
+        }
+        if (this.pos == this.source.length()) {
+            return this.source.toString();
+        }
+        return this.source.subSequence(0, this.pos).toString();
+    }
+
+    /** Returns the number of characters appended so far. */
+    @Override
+    public int length() {
+        return this.buf != null ? this.buf.length() : this.pos;
+    }
+}
diff --git
a/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/LinguisticSort.java
b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/LinguisticSort.java
new file mode 100644
index 0000000000..c1881c6440
--- /dev/null
+++
b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/LinguisticSort.java
@@ -0,0 +1,1172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.phoenix.util.i18n;
+
+import java.text.CollationKey;
+import java.text.Collator;
+import java.text.MessageFormat;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.phoenix.util.DeferredStringBuilder;
+
+import com.ibm.icu.impl.jdkadapter.CollatorICU;
+import com.ibm.icu.text.AlphabeticIndex;
+import com.ibm.icu.util.ULocale;
+
+import edu.umd.cs.findbugs.annotations.SuppressWarnings;
+
+
+/**
+ * This utility class was partially copied from Salesforce's
internationalization utility library
+ * (com.salesforce.i18n:i18n-util:1.0.4), which was released under the
3-clause BSD License.
+ * The i18n-util library is not maintained anymore, and it was using
vulnerable dependencies.
+ * For more info, see: https://issues.apache.org/jira/browse/PHOENIX-6818
+ *
+ * Contains all the information about linguistic sorting.
+ * The intent of this is to provide the SQL changes to the RDBMS to ensure
+ * that the sorting uses the locale provided in Java, and to make sure that
+ * the collation in Java will correspond as much as possible to what is in the
+ * DB.
+ *
+ * Rolodex is a feature in alphabetic/syllabary languages to restrict the set
+ * of rows in a list to those that start with a certain letter. In SQL
+ * this is usually LIKE 'A%', which will include different letters.
+ *
+ * To get the list of valid nls_sorts, run this in oracle
+ * select value from v$nls_valid_values where parameter='SORT';
+ */
+public enum LinguisticSort {
+ // English:
+ // Using oracle's upper() function to sort; digits come before letters,
+ // '[' is the lowest character after 'Z'. // balance-]
+ ENGLISH(Locale.ENGLISH, "[", false, false,
LinguisticSort.Alphabets.STRING), // balance-]
+
+ // German:
+ // Using oracle's nlssort() function to sort; digits come right after
letters.
+ GERMAN(new Locale("de"), LinguisticSort.Alphabets.GERMAN, "0", true, false,
+ "nlssort({0}, ''nls_sort=xgerman'')"),
+
+ // French:
+ // Using oracle's nlssort() function to sort; digits come right after
letters.
+ FRENCH(new Locale("fr"), "0", false, false, "nlssort({0},
''nls_sort=xfrench'')"),
+
+ // Italian:
+ // Using oracle's nlssort() function to sort; digits come right after
letters.
+ ITALIAN(new Locale("it"), "0", false, false, "nlssort({0},
''nls_sort=italian'')"),
+
+ // Spanish:
+ // Using oracle's nlssort() function to sort; digits come right after
letters.
+ // Alphabet consists of A-Z plus N-tilde. However, CH and LL are not
considered
+ // letters, so do not use Oracle's xspanish nlssort.
+ SPANISH(new Locale("es"), "0", false, false, "nlssort({0},
''nls_sort=spanish'')"),
+
+ // Catalan:
+ // Using oracle's nlssort() function to sort; digits come before letters,
+ // nothing sorts after the last legal catalan character.
+ CATALAN(new Locale("ca"), LinguisticSort.Alphabets.CATALAN, "0", true,
false,
+ "nlssort({0}, ''nls_sort=catalan'')"),
+
+ // Dutch:
+ // Using oracle's nlssort() function to sort; digits come right after
letters.
+ DUTCH(new Locale("nl"), "0", false, false, "nlssort({0},
''nls_sort=dutch'')"),
+
+ // Portuguese:
+ // Using oracle's nlssort() function to sort; digits come right after
letters.
+ PORTUGUESE(new Locale("pt"), "0", false, false, "nlssort({0},
''nls_sort=west_european'')"),
+
+ // Danish:
+ // Alphabet consists of A-Z followed by AE, O-stroke, and A-ring.
+ // Using oracle's nlssort() function to sort; digits come right after
letters.
+ DANISH(new Locale("da"), "0", false, false, "nlssort({0},
''nls_sort=danish'')"),
+
+ // Norwegian:
+ // Alphabet consists of A-Z followed by AE, O-stroke, and A-ring.
+ // Using oracle's nlssort() function to sort; digits come right after
letters.
+ NORWEGIAN(new Locale("no"), "0", false, false,
+ "nlssort({0}, ''nls_sort=norwegian'')"),
+
+ // Swedish:
+ // Alphabet consists of A-Z followed by A-ring, A-diaeresis, and
O-diaeresis.
+ // Using oracle's nlssort() function to sort; digits come before letters,
+ // nothing sorts after the last legal swedish character.
+ SWEDISH(new Locale("sv"), null, false, false,
+ "nlssort({0}, ''nls_sort=swedish'')"),
+
+ // Finnish:
+ // Alphabet consists of A-Z, minus W, followed by A-ring, A-diaeresis,
and O-diaeresis.
+ // We leave out W so that V's show up properly (bug #151961/W-513969)
+ // Using oracle's nlssort() function to sort; digits come right after
letters.
+ FINNISH(new Locale("fi"),
+ new String[] {
+ "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L",
"M", "N", "O", "P",
+ "Q", "R", "S", "T", "U", "V", "X", "Y", "Z", "\u00C5",
"\u00C4", "\u00D6" },
+ "0", false, false, "nlssort({0}, ''nls_sort=finnish'')"),
+
+ // Czech:
+ // Alphabet consists of many Czech letters but not all english letters.
+ // Using oracle's nlssort() function to sort; digits come right after
letters.
+ CZECH(new Locale("cs"), "0", true, false,
+ "nlssort({0}, ''nls_sort=xczech'')"),
+
+ // Polish:
+ // Alphabet consists of many Polish letters but not all english letters.
+ // Using oracle's nlssort() function to sort.
+ POLISH(new Locale("pl"), "\u00DF", false, false,
+ "nlssort({0}, ''nls_sort=polish'')"),
+
+ // Turkish:
+ // Use Turkish alphabet, which also indicates special handling in
getUpperCaseValue().
+ // Using oracle's nlssort() function to sort.
+ TURKISH(new Locale("tr"), LinguisticSort.Alphabets.TURKISH, null, false,
false,
+ "nlssort({0}, ''nls_sort=xturkish'')"),
+
+ // Traditional chinese:
+ // Use English alphabet. Using oracle's nlssort() function to sort by
stroke.
+ CHINESE_HK(new Locale("zh", "HK"), LinguisticSort.Alphabets.ENGLISH,
"\u03B1", true, true,
+ "nlssort({0}, ''nls_sort=tchinese_radical_m'')"),
+ CHINESE_HK_STROKE(new Locale("zh", "HK", "STROKE"),
LinguisticSort.Alphabets.ENGLISH, "\u03B1",
+ true, true, "nlssort({0}, ''nls_sort=tchinese_stroke_m'')"),
+
+ CHINESE_TW(new Locale("zh", "TW"), LinguisticSort.Alphabets.ENGLISH,
"\u03B1", true, true,
+ "nlssort({0}, ''nls_sort=tchinese_radical_m'')"),
+ CHINESE_TW_STROKE(new Locale("zh", "TW", "STROKE"),
LinguisticSort.Alphabets.ENGLISH, "\u03B1",
+ true, true, "nlssort({0}, ''nls_sort=tchinese_stroke_m'')"),
+
+
+ // Simplified chinese:
+ // Use English alphabet. Using oracle's nlssort() function to sort by
pinyin.
+ CHINESE(new Locale("zh"), LinguisticSort.Alphabets.ENGLISH, "\u03B1",
true, true,
+ "nlssort({0}, ''nls_sort=schinese_radical_m'')"),
+ CHINESE_STROKE(new Locale("zh", "", "STROKE"),
LinguisticSort.Alphabets.ENGLISH, "\u03B1",
+ true, true,
+ "nlssort({0}, ''nls_sort=schinese_stroke_m'')"),
+ CHINESE_PINYIN(new Locale("zh", "", "PINYIN"),
LinguisticSort.Alphabets.ENGLISH, "\u03B1",
+ true, true,
+ "nlssort({0}, ''nls_sort=schinese_pinyin_m'')"),
+
+
+ // Japanese:
+ // Japanese alphabet. Using oracle's nlssort() function to sort. Special
rolodex handling
+ JAPANESE(new Locale("ja"), LinguisticSort.Alphabets.JAPANESE, null, true,
true,
+ "nlssort({0}, ''nls_sort=japanese_m'')"),
+
+ // Korean:
+ // Use English alphabet. Using oracle's nlssort() function to sort.
+ KOREAN(new Locale("ko"), LinguisticSort.Alphabets.ENGLISH, "\u03B1", true,
true,
+ "nlssort({0}, ''nls_sort=korean_m'')"),
+
+ // Russian:
+ // Using oracle's nlssort() function to sort.
+ RUSSIAN(new Locale("ru"), null, false, false,
+ "nlssort({0}, ''nls_sort=russian'')"),
+
+ // Bulgarian:
+ // Using oracle's nlssort() function to sort.
+ BULGARIAN(new Locale("bg"), LinguisticSort.Alphabets.BULGARIAN, null,
true, false,
+ "nlssort({0}, ''nls_sort=bulgarian'')"),
+
+ // Indonesian
+ // Using oracle's nlssort() function to sort.
+ INDONESIAN(new Locale("in"), null, true, false, "nlssort({0},
''nls_sort=indonesian'')"),
+
+ // Romanian:
+ // Using oracle's nlssort() function to sort.
+ ROMANIAN(new Locale("ro"),
+ new String[] { "A", "\u0102", "\u00c2", "B", "C", "D", "E", "F",
"G", "H", "I",
+ "\u00ce", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S",
"\u015e", "T",
+ "\u0162", "U", "V", "W", "X", "Y", "Z" },
+ null, true, false, "nlssort({0}, ''nls_sort=romanian'')"),
+
+ // Vietnamese
+ // Using oracle's nlssort() function to sort.
+ VIETNAMESE(new Locale("vi"),
+ new String[] {
+ "A", "\u0102", "\u00c2", "B", "C", "D", "\u0110", "E",
"\u00ca", "G", "H",
+ "I", "K", "L", "M", "N", "O", "\u00d4", "\u01a0", "P", "Q",
"R", "S", "T",
+ "U", "\u01af", "V", "X", "Y" },
+ null, false, false, "nlssort({0}, ''nls_sort=vietnamese'')"),
+
+ // Ukrainian:
+ // Using oracle's nlssort() function to sort.
+ UKRAINIAN(new Locale("uk"), null, false, false, "nlssort({0},
''nls_sort=ukrainian'')"),
+
+ // Hungarian:
+ // Using oracle's nlssort() function to sort.
+ HUNGARIAN(new Locale("hu"), LinguisticSort.Alphabets.HUNGARIAN, null,
false, false,
+ "nlssort({0}, ''nls_sort=xhungarian'')"),
+
+ // Greek:
+ // Using oracle's nlssort() function to sort.
+ GREEK(new Locale("el"), null, false, false, "nlssort({0},
''nls_sort=greek'')"),
+
+ // Hebrew:
+ // Using oracle's nlssort() function to sort.
+ HEBREW(new Locale("iw"), null, true, false, "nlssort({0},
''nls_sort=hebrew'')"),
+
+ // Slovak:
+ // Using oracle's nlssort() function to sort.
+ SLOVAK(new Locale("sk"), LinguisticSort.Alphabets.SLOVAK, null, true,
false,
+ "nlssort({0}, ''nls_sort=slovak'')"),
+
+ // Serbian (cyrillic):
+ // Using oracle's nlssort() function to sort using its default
+ SERBIAN_CYRILLIC(new Locale("sr"), null, false, false,
+ "nlssort({0}, ''nls_sort=generic_m'')"),
+
+ // Serbian (latin):
+ // Using oracle's nlssort() function to sort using its default
+ SERBIAN_LATIN(new Locale("sh"), LinguisticSort.Alphabets.SERBIAN_LATIN,
null, false, false,
+ "nlssort({0}, ''nls_sort=xcroatian'')"),
+
+ // Bosnian:
+ // Using oracle's nlssort() function to sort using its default
+ BOSNIAN(new Locale("bs"), LinguisticSort.Alphabets.SERBIAN_LATIN, null,
false, false,
+ "nlssort({0}, ''nls_sort=xcroatian'')"),
+
+
+ // Georgian:
+ // Using oracle's nlssort() function to sort, even though we're using
binary for this.
+ GEORGIAN(new Locale("ka"), LinguisticSort.Alphabets.GEORGIAN, null, false,
false,
+ "nlssort({0}, ''nls_sort=binary'')"),
+
+ // BASQUE:
+ // Using oracle's nlssort() function to sort.
+ BASQUE(new Locale("eu"), LinguisticSort.Alphabets.BASQUE, null, false,
false,
+ "nlssort({0}, ''nls_sort=west_european'')"),
+
+ // MALTESE:
+ // Using oracle's nlssort() function to sort.
+ MALTESE(new Locale("mt"), null, false, false, "nlssort({0},
''nls_sort=west_european'')"),
+
+ // ROMANSH:
+ // Using oracle's nlssort() function to sort.
+ ROMANSH(new Locale("rm"), null, false, false, "nlssort({0},
''nls_sort=west_european'')"),
+
+ // LUXEMBOURGISH:
+ // Using oracle's nlssort() function to sort.
+ LUXEMBOURGISH(new Locale("lb"), LinguisticSort.Alphabets.LUXEMBOURGISH,
null, false, false,
+ "nlssort({0}, ''nls_sort=west_european'')"),
+
+ // IRISH:
+ // Using oracle's nlssort() function to sort.
+ IRISH(new Locale("ga"), null, false, false, "nlssort({0},
''nls_sort=west_european'')"),
+
+ // Slovenian:
+ // Using oracle's nlssort() function to sort.
+ SLOVENE(new Locale("sl"), LinguisticSort.Alphabets.SLOVENE, null, false,
false,
+ "nlssort({0}, ''nls_sort=xslovenian'')"),
+
+ // Croatian:
+ // Using oracle's nlssort() function to sort.
+ CROATIAN(new Locale("hr"), LinguisticSort.Alphabets.SERBIAN_LATIN, null,
false, false,
+ "nlssort({0}, ''nls_sort=xcroatian'')"),
+
+ // Malay
+ // Using oracle's nlssort() function to sort.
+ // We're assuming people are using the english alphabet,
+ // and not the arabic one (Bahasa Melayu)
+ MALAY(new Locale("ms"), null, true, false, "nlssort({0},
''nls_sort=malay'')"),
+
+ // Arabic:
+ // Using oracle's nlssort() function to sort.
+ ARABIC(new Locale("ar"), null, false, false, "nlssort({0},
''nls_sort=arabic'')"),
+
+ // Estonian:
+ // Using oracle's nlssort() function to sort.
+ ESTONIAN(new Locale("et"), LinguisticSort.Alphabets.ESTONIAN, null, true,
false,
+ "nlssort({0}, ''nls_sort=estonian'')"),
+
+ // Icelandic:
+ // Using oracle's nlssort() function to sort.
+ ICELANDIC(new Locale("is"), LinguisticSort.Alphabets.ICELANDIC, null,
true, false,
+ "nlssort({0}, ''nls_sort=icelandic'')"),
+
+ // Latvian:
+ // Using oracle's nlssort() function to sort.
+ LATVIAN(new Locale("lv"), LinguisticSort.Alphabets.LATVIAN, null, false,
false,
+ "nlssort({0}, ''nls_sort=latvian'')"),
+
+ // Lithuanian:
+ // Using oracle's nlssort() function to sort.
+ LITHUANIAN(new Locale("lt"), LinguisticSort.Alphabets.LITHUANIAN, null,
false, false,
+ "nlssort({0}, ''nls_sort=lithuanian'')"),
+
+
+ // Languages not supported fully.
+ KYRGYZ(new Locale("ky"), LinguisticSort.Alphabets.KYRGYZ, null, true,
false,
+ "nlssort({0}, ''nls_sort=binary'')"),
+
+ KAZAKH(new Locale("kk"), LinguisticSort.Alphabets.KAZAKH, null, true,
false,
+ "nlssort({0}, ''nls_sort=binary'')"),
+
+ TAJIK(new Locale("tg"), LinguisticSort.Alphabets.TAJIK, null, true, false,
+ "nlssort({0}, ''nls_sort=russian'')"),
+
+ BELARUSIAN(new Locale("be"), null, true, false, "nlssort({0},
''nls_sort=russian'')"),
+
+ TURKMEN(new Locale("tk"), LinguisticSort.Alphabets.TURKISH, null, false,
false,
+ "nlssort({0}, ''nls_sort=xturkish'')"),
+
+ AZERBAIJANI(new Locale("az"), LinguisticSort.Alphabets.AZERBAIJANI, null,
false, false,
+ "nlssort({0}, ''nls_sort=xturkish'')"),
+
+ ARMENIAN(new Locale("hy"), null, true, false, "nlssort({0},
''nls_sort=binary'')"),
+
+ THAI(new Locale("th"), null, true, false, "nlssort({0},
''nls_sort=thai_dictionary'')"),
+
+ // Binary? really
+ HINDI(new Locale("hi"), null, true, false, "nlssort({0},
''nls_sort=binary'')"),
+
+ URDU(new Locale("ur"), LinguisticSort.Alphabets.URDU, null, false, false,
+ "nlssort({0}, ''nls_sort=arabic'')"),
+
+ // Bengali
+ BENGALI(new Locale("bn"), LinguisticSort.Alphabets.BENGALI, null, true,
false,
+ "nlssort({0}, ''nls_sort=bengali'')"),
+
+ TAMIL(new Locale("ta"), LinguisticSort.Alphabets.TAMIL, null, true, false,
+ "nlssort({0}, ''nls_sort=binary'')"),
+
+ // Unused language for testing; Alphabet and sorting defaults to English
+ ESPERANTO(new Locale("eo"), LinguisticSort.Alphabets.ENGLISH, "[", false,
false,
+ LinguisticSort.Alphabets.STRING);
+
+ private static final Map<Locale, LinguisticSort> BY_LOCALE =
getByLocaleInfo();
+
+    /**
+     * Builds the locale lookup table stored in BY_LOCALE. The map is filled completely
+     * first and only then wrapped (by unmodifiableMap below) before being assigned to the
+     * final field, in order to get a proper guarantee from Java's memory model.
+     *
+     * See http://jeremymanson.blogspot.com/2008/07/immutability-in-java-part-2.html
+     */
+    private static Map<Locale, LinguisticSort> getByLocaleInfo() {
+        final Map<Locale, LinguisticSort> sortsByLocale =
+                new HashMap<Locale, LinguisticSort>(64);
+        for (LinguisticSort candidate : values()) {
+            // put() hands back the previous mapping, which exposes duplicate locales.
+            LinguisticSort previous = sortsByLocale.put(candidate.getLocale(), candidate);
+            assert previous == null
+                    : "Two linguistic sorts with the same locale: " + candidate.getLocale();
+        }
+        return Collections.unmodifiableMap(sortsByLocale);
+    }
+
+    /**
+     * Get sorting info for the given locale.
+     *
+     * Lookup order: the exact locale; then, if the locale has a variant, a retry with the
+     * variant handled as described below; then, if it has a country, the bare language.
+     * ENGLISH is returned when nothing matches.
+     *
+     * @param locale the locale to look up
+     * @return the matching sort, or ENGLISH when there is none
+     */
+    public static LinguisticSort get(Locale locale) {
+        // For non-UTF8 dbs, we always interpret everything as English. (We did not set
+        // the page encoding to UTF-8, and thus we may have incorrectly encoded data.)
+        // On all other instances, look for the language of the user's locale. This should
+        // succeed because every language we support is listed in data. But just in case,
+        // default to english also.
+        if (IS_MULTI_LINGUAL /*|| TestContext.isRunningTests()*/) {
+            LinguisticSort sort = BY_LOCALE.get(locale);
+            if (sort != null) {
+                return sort;
+            }
+            if (locale.getVariant().length() > 0) {
+                if ("zh".equals(locale.getLanguage())) {
+                    // TW and HK are handled above, this handles SG
+                    // The country must be tested here, not the language: the language is
+                    // always "zh" in this branch, so testing it made an unknown variant
+                    // with no country (e.g. zh__FOO) recurse on an identical locale forever.
+                    if (!"".equals(locale.getCountry())) {
+                        // Retry with the country dropped but the variant kept.
+                        return get(new Locale(locale.getLanguage(), "", locale.getVariant()));
+                    }
+                }
+                // Retry with the variant dropped. This cannot recurse again because the
+                // new locale has no variant.
+                return get(new Locale(locale.getLanguage(), locale.getCountry()));
+            }
+            if (locale.getCountry().length() > 0) {
+                sort = BY_LOCALE.get(new Locale(locale.getLanguage()));
+                if (sort != null) {
+                    return sort;
+                }
+            }
+        }
+        return ENGLISH;
+    }
+
+ /**
+ * The locale for this LinguisticSort instance.
+ */
+ private final Locale locale;
+
+ /**
+ * Collator for this LinguisticSort instance. This may be different than
the
+ * default collator for its locale. This is to better match Oracle's nls
sort
+ * ordering.
+ */
+ private final Collator collator;
+
+ /**
+ * Array of letters (Strings) to show in the rolodex. An empty array for
+ * alphabet means that the rolodex is not supported for the locale.
+ */
+ private final String[] alphabet;
+
+ /**
+ * An optional String that sorts higher than all letters in the alphabet.
+ * Used when generating the rolodex sql for the last letter.
+ */
+ private final String highValue;
+
+ /**
+ * True if normal secondary sorting is reversed, i.e., if lower case letters
+ * are sorted before upper case.
+ */
+ private final boolean reverseSecondary;
+
+ /**
+ * True if the locale has a double-width alphabet, numbers or symbols,
+ * so we use Oracle's to_single_byte to convert them into half-width letters.
+ */
+ private final boolean hasDoubleWidth;
+
+ /**
+ * A MessageFormat pattern for generating an oracle sql expression
returning the
+ * collation key for sorting a sql expression. Not used by postgres.
+ */
+ private final String collationKeySql;
+
+ /**
+ * For upper-casing Java values and generating SQL to generate the same.
Not used by postgres.
+ */
+ private final OracleUpperTable upper;
+
+ /**
+ * Constructor only used when building static data, where ICU should be used to derive the
+ * value for the alphabet. It delegates to the six-argument constructor, passing the result
+ * of getAlphabetFromICU(locale) as the alphabet and forwarding every other argument
+ * unchanged.
+ *
+ * @param locale the locale for this sort; also the input to getAlphabetFromICU
+ * @param highValue optional String that sorts higher than all letters in the alphabet
+ * @param reverseSecondary whether normal secondary sorting is reversed
+ * @param hasDoubleWidth whether the locale has double width characters
+ * @param collationKeySql MessageFormat pattern for the collation key sql expression
+ */
+ LinguisticSort(Locale locale, String highValue, boolean reverseSecondary,
+ boolean hasDoubleWidth, String collationKeySql) {
+ this(locale, getAlphabetFromICU(locale), highValue, reverseSecondary,
+ hasDoubleWidth, collationKeySql);
+ }
+
+    /**
+     * Holder for the locales whose collator is built from an explicit ICU4J language tag.
+     * javac complains if we attempt to refer to a static defined inside the same class as
+     * an enum, so the constant mapping lives in this nested class instead.
+     */
+    private static final class Icu4jCollatorOverrides {
+        static final Map<Locale, String> OVERRIDES = getIcu4jCollatorOverrides();
+
+        /**
+         * Builds the read-only map from a Locale to the BCP47 language tag to use when
+         * calling ICU4J's Collator.getInstance(ULocale.forLanguageTag()).
+         *
+         * Built-in JDK collators for Chinese are behind the Unicode standard, so we need
+         * to override them. See discussion at
+         * https://stackoverflow.com/questions/33672422
+         * /wrong-sorting-with-collator-using-locale-simplified-chinese
+         * Also see the following JDK collator bugs:
+         * https://bugs.openjdk.java.net/browse/JDK-6415666
+         * https://bugs.openjdk.java.net/browse/JDK-2143916
+         * https://bugs.openjdk.java.net/browse/JDK-6411864
+         */
+        private static Map<Locale, String> getIcu4jCollatorOverrides() {
+            final Map<Locale, String> tagsByLocale = new HashMap<Locale, String>(7);
+
+            // Hong Kong: CHINESE_HK and CHINESE_HK_STROKE
+            tagsByLocale.put(new Locale("zh", "HK"), "zh-HK-u-co-unihan");
+            tagsByLocale.put(new Locale("zh", "HK", "STROKE"), "zh-HK-u-co-stroke");
+
+            // Taiwan: CHINESE_TW and CHINESE_TW_STROKE
+            tagsByLocale.put(new Locale("zh", "TW"), "zh-TW-u-co-unihan");
+            tagsByLocale.put(new Locale("zh", "TW", "STROKE"), "zh-TW-u-co-stroke");
+
+            // No country: CHINESE, CHINESE_STROKE and CHINESE_PINYIN
+            tagsByLocale.put(new Locale("zh"), "zh-CN-u-co-unihan");
+            tagsByLocale.put(new Locale("zh", "", "STROKE"), "zh-CN-u-co-stroke");
+            tagsByLocale.put(new Locale("zh", "", "PINYIN"), "zh-CN-u-co-pinyin");
+
+            return Collections.unmodifiableMap(tagsByLocale);
+        }
+    }
+
+ /**
+ * Constructor only used when building static data
+ */
+ LinguisticSort(Locale locale, String[] alphabet, String highValue, boolean
reverseSecondary,
+ boolean hasDoubleWidth, String collationKeySql) {
+ this.locale = locale;
+ this.alphabet = alphabet;
+ this.highValue = highValue;
+ assert this.highValue == null || this.highValue.length() == 1;
+ this.reverseSecondary = reverseSecondary;
+ this.hasDoubleWidth = hasDoubleWidth;
+ this.collationKeySql = collationKeySql;
+ // Construct collator for this locale
+ if
(LinguisticSort.Icu4jCollatorOverrides.OVERRIDES.containsKey(this.locale)) {
+ // Force ICU4J collators for specific locales so they match Oracle
sort
+ this.collator =
CollatorICU.wrap(com.ibm.icu.text.Collator.getInstance(
+ ULocale.forLanguageTag(LinguisticSort
+
.Icu4jCollatorOverrides.OVERRIDES.get(this.locale))));
+ } else if (this.locale.getVariant().length() > 0) {
+ // If there's a variant, use ICU4J to figure it out.
+ this.collator =
CollatorICU.wrap(com.ibm.icu.text.Collator.getInstance(
+ ULocale.forLocale(this.locale)));
+ } else {
+ this.collator = Collator.getInstance(this.locale);
+ }
+ this.collator.setStrength(Collator.SECONDARY);
+ this.upper = OracleUpperTable.forLinguisticSort(name());
+ }
+
+    /**
+     * Hands out a private copy of the collator of this LinguisticSort instance.
+     *
+     * @return a new collator for this LinguisticSort instance.
+     */
+    public Collator getCollator() {
+        // RuleBasedCollator.compare() is synchronized, so sharing this.collator would make
+        // requests for the same language wait for each other. A clone avoids that, and
+        // cloning a RuleBasedCollator is much more efficient than creating one from the
+        // rules.
+        final Object copy = this.collator.clone();
+        return (Collator) copy;
+    }
+
+    /**
+     * Danish collation, unfortunately, is a little odd, in that "v" and "w" are considered
+     * to be the same character. To make up for this, they made "v" and "V" a secondary
+     * difference, which makes Enum comparisons in FilterItem a little wonky.
+     * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4174436
+     *
+     * @return a new collator for this LinguisticSort instance that is guaranteed to be
+     *         case-insensitive; for Danish its strength is lowered to PRIMARY.
+     */
+    public Collator getGuaranteedCaseInsensitiveCollator() {
+        final Collator result = getCollator();
+        final boolean isDanish = "da".equals(this.locale.getLanguage());
+        if (isDanish) {
+            result.setStrength(Collator.PRIMARY);
+        }
+        return result;
+    }
+
+    /** @return the locale this LinguisticSort instance was built for. */
+    Locale getLocale() {
+        return locale;
+    }
+
+    /**
+     * Returns a copy of this instance's collator viewed as a comparator of strings.
+     *
+     * @return a new comparator for strings for this LinguisticSort instance.
+     */
+    @SuppressWarnings("unchecked")
+    public Comparator<String> getNonCachingComparator() {
+        // Converting from Comparator<Object> to Comparator<String>
+        final Object copy = this.collator.clone();
+        return (Comparator<String>) copy;
+    }
+
+    /**
+     * Creates a string comparator backed by its own copy of this instance's collator.
+     *
+     * @param size the number of elements to compare (default is 16); used as the initial
+     *             capacity of the comparator's collation-key cache.
+     * @return a new comparator for strings for this LinguisticSort instance.
+     */
+    public Comparator<String> getComparator(int size) {
+        final Collator privateCopy = getCollator();
+        return new LinguisticSort.CollatingComparator(privateCopy, size);
+    }
+
+    /**
+     * A String comparator that uses the current collation. Collation keys are computed
+     * once per distinct string and remembered in a plain HashMap for later comparisons.
+     * Null sorts before any non-null string.
+     */
+    static class CollatingComparator implements Comparator<String> {
+        private final Collator collator;
+        private final Map<String, CollationKey> cKeyMap;
+
+        /** Creates a comparator whose key cache starts with a capacity of 16. */
+        CollatingComparator(Collator collator) {
+            this(collator, 16);
+        }
+
+        /**
+         * @param collator    the collator used to produce collation keys
+         * @param defaultSize initial capacity of the collation-key cache
+         */
+        CollatingComparator(Collator collator, int defaultSize) {
+            this.collator = collator;
+            cKeyMap = new HashMap<>(defaultSize);
+        }
+
+        @SuppressWarnings(
+            value = "ES_COMPARING_PARAMETER_STRING_WITH_EQ",
+            justification = "Reference comparison used for performance improvement.")
+        public int compare(String o1, String o2) {
+            // Same reference (including both null): nothing to collate.
+            if (o1 == o2) {
+                return 0;
+            }
+            if (o2 == null) {
+                return 1;
+            }
+            if (o1 == null) {
+                return -1;
+            }
+            final CollationKey left = getCollationKey(o1);
+            final CollationKey right = getCollationKey(o2);
+            return left.compareTo(right);
+        }
+
+        /** Returns the cached collation key for {@code comp}, computing it on first use. */
+        private CollationKey getCollationKey(String comp) {
+            final CollationKey cached = cKeyMap.get(comp);
+            if (cached != null) {
+                return cached;
+            }
+            final CollationKey fresh = collator.getCollationKey(comp);
+            cKeyMap.put(comp, fresh);
+            return fresh;
+        }
+    }
+
+    /**
+     * @return the number of letters to show in the rolodex (0 when the alphabet is empty).
+     */
+    public int getAlphabetLength() {
+        final String[] letters = this.alphabet;
+        return letters.length;
+    }
+
+    /**
+     * Returns the n-th letter in the rolodex. Note that a 'letter' in a language may be
+     * composed of more than one unicode character, for example the letter 'ch' in Czech.
+     *
+     * @param index zero-based position in the rolodex alphabet
+     */
+    public String getAlphabet(int index) {
+        final String[] letters = this.alphabet;
+        return letters[index];
+    }
+
+    // Used only for test code. Returns the backing array itself, not a copy.
+    String[] getAlphabet() {
+        return alphabet;
+    }
+
+    /**
+     * Return the rolodexIndex for a string.
+     *
+     * @param searchTerm Must be a 1-char string
+     * @return the rolodexIndex, including Other (i.e. getAlphabetLength) if it doesn't
+     *         fall into a bucket. If this language doesn't have a rolodex (e.g. Arabic,
+     *         Latvian, etc.) return -1
+     * @throws IllegalArgumentException if the string is null or not of length 1
+     */
+    public int getRolodexIndexForChar(String searchTerm) {
+        if (searchTerm == null || searchTerm.length() != 1) {
+            throw new IllegalArgumentException("Must be a one-length string");
+        }
+
+        final int letterCount = this.getAlphabetLength();
+        if (letterCount == 0) {
+            return -1;
+        }
+
+        for (int bucket = 0; bucket < letterCount; bucket++) {
+            final int order = this.collator.compare(searchTerm, this.getAlphabet(bucket));
+            if (order == 0) {
+                return bucket;
+            }
+            if (order < 0) {
+                // Sorting before the very first letter means Other. Otherwise the term
+                // sorts after the previous letter but before this one, so it belongs to
+                // the previous rolodex letter.
+                return bucket == 0 ? letterCount : bucket - 1;
+            }
+        }
+        // Sorts after every letter of the alphabet: Other.
+        return letterCount;
+    }
+
+    /**
+     * Returns the sql expression to convert the given sql expression to upper case.
+     *
+     * @param expr       the sql expression to upper-case
+     * @param isPostgres true to emit the postgres icu_upper() form for this locale,
+     *                   false to use the Oracle upper-casing table
+     */
+    public String getUpperCaseSql(String expr, boolean isPostgres) {
+        if (!isPostgres) {
+            return upper.getSql(expr);
+        }
+        final String localeName = this.locale.toString();
+        return "icu_upper(" + expr + ",'" + localeName + "')";
+    }
+
+    /**
+     * @return true if sql UPPER() is used in getUpperCaseSql(). Note that this is always
+     *         false for postgres because postgres always uses the icu_upper() function
+     *         for all languages.
+     */
+    public boolean usesUpperToGetUpperCase(boolean isPostgres) {
+        if (isPostgres) {
+            return false;
+        }
+        final String generated = upper.getSql("x");
+        return "upper(x)".equals(generated);
+    }
+
+    /**
+     * Returns the upper case value of the given value, or what would be the result
+     * of applying the sql expression in getUpperCaseSql() to the given value.
+     * Double width characters are first narrowed when the locale has them.
+     */
+    public String getUpperCaseValue(String value, boolean isPostgres) {
+        final String singleWidth = this.hasDoubleWidth ? toSingleWidth(value) : value;
+        return isPostgres
+                ? singleWidth.toUpperCase(this.locale)
+                : upper.toUpperCase(singleWidth);
+    }
+
+    /**
+     * Sparse two-level table from double width characters to their single width ASCII
+     * equivalents, indexed as [high byte][low byte] of the source char. A null row or a
+     * zero entry means that the character has no translation.
+     */
+    private static final char[][] DOUBLE_TO_SINGLE = new char[256][];
+    static {
+        // U+20xx: U+2018, U+2019 and U+201D
+        final char[] row20 = new char[256];
+        row20[0x18] = '`';
+        row20[0x19] = '\'';
+        row20[0x1D] = '"';
+        DOUBLE_TO_SINGLE[0x20] = row20;
+
+        // U+22xx: U+223C
+        final char[] row22 = new char[256];
+        row22[0x3C] = '~';
+        DOUBLE_TO_SINGLE[0x22] = row22;
+
+        // U+30xx: U+3000
+        final char[] row30 = new char[256];
+        row30[0x00] = ' ';
+        DOUBLE_TO_SINGLE[0x30] = row30;
+
+        // U+FExx: U+FE3F
+        final char[] rowFE = new char[256];
+        rowFE[0x3F] = '^';
+        DOUBLE_TO_SINGLE[0xFE] = rowFE;
+
+        // U+FFxx: every mapped entry from U+FF01 ('!') to U+FF5D ('}') is the ASCII char
+        // whose code is the low byte plus 0x20. Four low bytes in that range are left
+        // without a translation: 0x02, 0x07, 0x3E and 0x40.
+        final char[] rowFF = new char[256];
+        for (int low = 0x01; low <= 0x5D; low++) {
+            final boolean untranslated =
+                    low == 0x02 || low == 0x07 || low == 0x3E || low == 0x40;
+            if (!untranslated) {
+                rowFF[low] = (char) (low + 0x20);
+            }
+        }
+        DOUBLE_TO_SINGLE[0xFF] = rowFF;
+    }
+
+    /**
+     * Converts one double width character to its single width equivalent.
+     *
+     * @param c the character to translate
+     * @return the translated character, or {@code c} itself when the table holds no
+     *         translation for it
+     */
+    public static char toSingleWidth(char c) {
+        // The high byte of the char selects the row of the table.
+        final char[] row = DOUBLE_TO_SINGLE[c >> 8];
+        if (row != null) {
+            // The low byte selects the entry; zero means "no translation".
+            final char mapped = row[c & 0x00ff];
+            if (mapped != 0) {
+                return mapped;
+            }
+        }
+        return c;
+    }
+
+    /**
+     * Convert double width ascii characters to single width.
+     * This is the equivalent of Oracle's to_single_byte().
+     *
+     * @param value the string to narrow, translated one char at a time
+     */
+    public static String toSingleWidth(String value) {
+        final int length = value.length();
+        final DeferredStringBuilder narrowed = new DeferredStringBuilder(value);
+
+        int pos = 0;
+        while (pos < length) {
+            narrowed.append(toSingleWidth(value.charAt(pos)));
+            pos++;
+        }
+        return narrowed.toString();
+    }
+
+    /**
+     * Returns the sql expression to compute the linguistic sort collation key for the
+     * given sql expression. This supports sorting in the database, where sort order
+     * of different upper and lower cases are handled linguistically.
+     *
+     * @param expr       the sql expression to wrap
+     * @param isPostgres true to emit the postgres icu_sortkey() form for this locale,
+     *                   false to apply this instance's MessageFormat pattern
+     */
+    public String getCollationKeySql(String expr, boolean isPostgres) {
+        if (!isPostgres) {
+            final Object[] patternArgs = { expr };
+            return MessageFormat.format(this.collationKeySql, patternArgs);
+        }
+        final String localeName = this.locale.toString();
+        return "icu_sortkey(" + expr + ",'" + localeName + "')::text";
+    }
+
+    /**
+     * Returns the sql expression to compute the linguistic sort collation key for the
+     * upper case of given sql expression. This supports case-insensitive filtering
+     * in the database.
+     */
+    public String getUpperCollationKeySql(String expr, boolean isPostgres) {
+        if (!isPostgres) {
+            // When the collation key pattern is itself the upper-casing expression,
+            // upper-casing first would only apply the same expression twice.
+            final String upperPattern = String.format(upper.getSqlFormatString(), "{0}");
+            if (upperPattern.equals(this.collationKeySql)) {
+                return getCollationKeySql(expr, false);
+            }
+        }
+        final String upperCased = getUpperCaseSql(expr, isPostgres);
+        return getCollationKeySql(upperCased, isPostgres);
+    }
+
+ private String formatLetter(String letter, boolean isPostgres) {
+ return getCollationKeySql('\'' + letter + '\'', isPostgres);
+ }
+
+ //
+ // Private Data
+ //
+
+    // TODO: Make this an environment variable. Currently hard-coded to true.
+    private static final boolean IS_MULTI_LINGUAL = true; /* Expression kept for reference:
+        (SfdcEnvProvider.getEnv() == null ||
+        SfdcEnvProvider.getEnv().getIniFile().getString("Pages", "encoding").length() > 0); */
+
+ static String[] getAlphabetFromICU(Locale locale) {
+ AlphabeticIndex<?> index = new AlphabeticIndex<String>(locale);
+ List<String> alphabet = index.getBucketLabels();
+ if (alphabet.size() > 6) {
+ // Strip off first and last (which are ...)
+ List<String> alphabetWithoutEllipses = alphabet.subList(1,
alphabet.size() - 1);
+ return alphabetWithoutEllipses.toArray(new
String[alphabetWithoutEllipses.size()]);
+ } else {
+ return new String[0];
+ }
+ }
+
+ /**
+ * You can't refer to a static defined inside the same class as an enum,
so you need an
+ * inner class to have such constants
+ * These are the alphabets that cannot be auto-derived from ICU's CLDR
information
+ */
+ static final class Alphabets {
+ static final String[] ENGLISH = { "A", "B", "C", "D", "E", "F", "G",
"H", "I", "J", "K",
+ "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X",
"Y", "Z" };
+ static final String[] CATALAN = { "A", "B", "C", "\u00C7", "D", "E",
"F", "G", "H", "I",
+ "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V",
"W", "X", "Y", "Z" };
+ static final String[] BASQUE = { "A", "B", "C", "\u00C7", "D", "E",
"F", "G", "H", "I",
+ "J", "K", "L", "M", "N", "\u00D1", "O", "P", "Q", "R", "S", "T",
"U", "V", "W", "X",
+ "Y", "Z" };
+ static final String[] JAPANESE = { "A", "B", "C", "D", "E", "F", "G",
"H", "I", "J", "K",
+ "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X",
"Y", "Z", "\u30A2",
+ "\u30AB", "\u30B5", "\u30BF", "\u30CA", "\u30CF", "\u30DE",
"\u30E4", "\u30E9",
+ "\u30EF" };
+
+ // A, B, C, Cs, D, E, F, G, Gy, H, I, J, K, L, Ly, M, N, Ny, O, Ö, P,
Q, R, S, Sz, T,
+ // Ty, U, Ü, V, W, X, Y, Z, Zs
+ static final String[] HUNGARIAN = { "A", "B", "C", "Cs", "D", "E",
"F", "G", "Gy", "H",
+ "I", "J", "K", "L", "Ly", "M", "N", "Ny", "O", "\u00d6", "P", "Q",
"R", "S", "Sz",
+ "T", "Ty", "U", "\u00dc", "V", "W", "X", "Y", "Z", "Zs" };
+
+ static final String[] TURKISH = { "A", "B", "C", "\u00C7", "D", "E",
"F", "G", "\u011E",
+ "H", "I", "\u0130", "J", "K", "L", "M", "N", "O", "\u00D6", "P",
"R", "S", "\u015E",
+ "T", "U", "\u00DC", "V", "Y", "Z" };
+
+ // A, B, C, Ç, D, E, Ə, F, G, Ğ, H, X, I, İ, J, K, Q, L, M, N, O, Ö,
P, R, S, Ş, T,
+ // U, Ü, V, Y, Z
+ static final String[] AZERBAIJANI = { "A", "B", "C", "\u00C7", "D",
"E", "\u018F", "F",
+ "G", "\u011E", "H", "X", "I", "\u0130", "J", "K", "Q", "L", "M",
"N", "O", "\u00D6",
+ "P", "R", "S", "\u015E", "T", "U", "\u00DC", "V", "Y", "Z" };
+
+ // Russian without Ё, Ы, Э
+ static final String[] BULGARIAN = { "\u0410", "\u0411", "\u0412",
"\u0413", "\u0414",
+ "\u0415", "\u0416", "\u0417", "\u0418", "\u0419", "\u041a",
"\u041b", "\u041c",
+ "\u041d", "\u041e", "\u041f", "\u0420", "\u0421", "\u0422",
"\u0423", "\u0424",
+ "\u0425", "\u0426", "\u0427", "\u0428", "\u0429", "\u042a",
"\u042c", "\u042e",
+ "\u042f" };
+
+ // A B C Č Ć D Đ Dž E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž
+ static final String[] SERBIAN_LATIN = { "A", "B", "C", "\u010c",
"\u0106", "D", "\u0110",
+ "D\u017e", "E", "F", "G", "H", "I", "J", "K", "L", "Lj", "M", "N",
"Nj", "O", "P", "R",
+ "S", "\u0160", "T", "U", "V", "Z", "\u017d" };
+
+ // A Á Ä B C Č D Ď DZ DŽ E É F G H CH I Í J K L Ĺ Ľ M N Ň O Ó Ô P Q R
Ŕ S Š T Ť U Ú V W
+ // X Y Ý Z Ž
+ static final String[] SLOVAK = { "A", "\u00c1", "\u00c4", "B", "C",
"\u010c", "D",
+ "\u010e", "DZ", "D\u017d", "E", "\u00c9", "F", "G", "H", "CH",
"I", "\u00cd", "J",
+ "K", "L", "\u0139", "\u013d", "M", "N", "\u0147", "O", "\u00d3",
"\u00d4", "P", "Q",
+ "R", "\u0154", "S", "\u0160", "T", "\u0164", "U", "\u00da", "V",
"W", "X", "Y",
+ "\u00dd", "Z", "\u017d" };
+
+ // ა ბ გ დ ე ვ ზ თ ი კ ლ მ ნ ო პ ჟ რ ს ტ უ ფ ქ ღ .ყ შ ჩ ც ძ წ ჭ ხ ჯ ჰ
+ static final String[] GEORGIAN = { "\u10d0", "\u10d1", "\u10d2",
"\u10d3", "\u10d4",
+ "\u10d5", "\u10d6", "\u10d7", "\u10d8", "\u10d9", "\u10da",
"\u10db", "\u10dc",
+ "\u10dd", "\u10de", "\u10df", "\u10e0", "\u10e1", "\u10e2",
"\u10e3", "\u10e4",
+ "\u10e5", "\u10e6", "\u10e7", "\u10e8", "\u10e9", "\u10ea",
"\u10eb", "\u10ec",
+ "\u10ed", "\u10ee", "\u10ef", "\u10f0" };
+
+ // A B C D E F G H I J K L M N O P Q R S Š Z Ž T U V W Õ Ä Ö Ü X Y
+ static final String[] ESTONIAN = { "A", "B", "C", "D", "E", "F", "G",
"H", "I", "J", "K",
+ "L", "M", "N", "O", "P", "Q", "R", "S", "\u0160", "Z", "\u017d",
"T", "U", "V", "W",
+ "\u00d5", "\u00c4", "\u00d6", "\u00dc", "X", "Y" };
+
+ // A Á B D Ð E É F G H I Í J K L M N O Ó P R S T U Ú V X Y Ý Þ Æ Ö
+ static final String[] ICELANDIC = { "A", "\u00c1", "B", "D", "\u00d0",
"E", "\u00c9", "F",
+ "G", "H", "I", "\u00cd", "J", "K", "L", "M", "N", "O", "\u00d3",
"P", "R", "S", "T",
+ "U", "\u00da", "V", "X", "Y", "\u00dd", "\u00de", "\u00c6",
"\u00d6" };
+
+ // A Ā B C Č D E Ē F G Ģ H I Ī J K Ķ L Ļ M N Ņ O P R S Š T U Ū V Z Ž
+ static final String[] LATVIAN = { "A", "\u0100", "B", "C", "\u010c",
"D", "E", "\u0112",
+ "F", "G", "\u0122", "H", "I", "\u012a", "J", "K", "\u0136", "L",
"\u013b", "M", "N",
+ "\u0145", "O", "P", "R", "S", "\u0160", "T", "U", "\u016a", "V",
"Z", "\u017d" };
+
+ // A B C D E F G H I J K L M N O P R S T U V W X Y Z Ä Ë É
+ // (note: this table contains no Q)
+ static final String[] LUXEMBOURGISH = { "A", "B", "C", "D", "E", "F",
"G", "H", "I",
+ "J", "K", "L", "M", "N", "O", "P", "R", "S", "T", "U", "V", "W",
"X", "Y", "Z",
+ "Ä", "Ë", "É" };
+
+ // Russian with Ң, Ө, Ү
+ static final String[] KYRGYZ = { "\u0410", "\u0411", "\u0412",
"\u0413", "\u0414",
+ "\u0415", "\u0401", "\u0416", "\u0417", "\u0418", "\u0419",
"\u041a", "\u041b",
+ "\u041c", "\u041d", "\u04a2", "\u041e", "\u04e8", "\u041f",
"\u0420", "\u0421",
+ "\u0422", "\u0423", "\u04ae", "\u0424", "\u0425", "\u0426",
"\u0427", "\u0428",
+ "\u0429", "\u042a", "\u042b", "\u042c", "\u042d", "\u042e",
"\u042f" };
+
+ // Kyrgyz with Ә, Ғ, Ұ, Һ, І (ICU4J doesn't have some of these
characters for sorting...)
+ static final String[] KAZAKH = { "\u0410", "\u04d8", "\u0411",
"\u0412", "\u0413",
+ "\u0492", "\u0414", "\u0415", "\u0401", "\u0416", "\u0417",
"\u0418", "\u0419",
+ "\u041a", "\u049a", "\u041b", "\u041c", "\u041d", "\u04a2",
"\u041e", "\u04e8",
+ "\u041f", "\u0420", "\u0421", "\u0422", "\u0423", "\u04b0",
"\u04ae", "\u0424",
+ "\u0425", "\u04ba", "\u0426", "\u0427", "\u0428", "\u0429",
"\u042a", "\u042b",
+ "\u0406", "\u042c", "\u042d", "\u042e", "\u042f" };
+
+ // Cyrillic Variant
+ static final String[] TAJIK = { "\u0410", "\u0411", "\u0412",
"\u0413", "\u0492", "\u0414",
+ "\u0415", "\u0401", "\u0416", "\u0417", "\u0418", "\u04e2",
"\u0419", "\u041a",
+ "\u049a", "\u041b", "\u041c", "\u041d", "\u041e", "\u041f",
"\u0420", "\u0421",
+ "\u0422", "\u0423", "\u04ee", "\u0424", "\u0425", "\u04b2",
"\u0427", "\u04b6",
+ "\u0428", "\u042a", "\u042d", "\u042e", "\u042f" };
+
+ // اآبپتٹثجچحخدڈذرڑزژسشصضطظعغفقکگلمنوەھ۶ىے
+ static final String[] URDU = new String[] {"\u0627", "\u0622",
"\u0628", "\u067e",
+ "\u062a", "\u0679", "\u062b", "\u062c", "\u0686", "\u062d",
"\u062e", "\u062f",
+ "\u0688", "\u0630", "\u0631", "\u0691", "\u0632", "\u0698",
"\u0633", "\u0634",
+ "\u0635", "\u0636", "\u0637", "\u0638", "\u0639", "\u063a",
"\u0641", "\u0642",
+ "\u06a9", "\u06af", "\u0644", "\u0645", "\u0646", "\u0648",
"\u06d5", "\u06be",
+ "\u06f6", "\u0649", "\u06d2" };
+
+ // W-1308726: removed Ö and Ü; oracle treats them as the same
characters as O and U.
+ // A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, ß, T, U,
V, W, X, Y, Z
+ static final String[] GERMAN = { "A", "B", "C", "D", "E", "F", "G",
"H", "I", "J", "K",
+ "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X",
"Y", "Z" };
+
+ //
ক,খ,গ,ঘ,ঙ,চ,ছ,জ,ঝ,ঞ,ট,ঠ,ড,ঢ,ণ,ত,দ,ধ,ন,প,ফ,ব,ভ,ম,য,র,ল,শ,ষ,স,হ,য়,ড়,ঢ,অ,
+ // আ,ই,ঈ,উ,ঊ,ঋ,ৠ,এ,ঐ,ও,ঔ
+ static final String[] BENGALI = { "\u0995", "\u0996", "\u0997",
"\u0998", "\u0999",
+ "\u099a", "\u099b", "\u099c", "\u099d", "\u099e", "\u099f",
"\u09a0", "\u09a1",
+ "\u09a2", "\u09a3", "\u09a4", "\u09a6", "\u09a7", "\u09a8",
"\u09aa", "\u09ab",
+ "\u09ac", "\u09ad", "\u09ae", "\u09af", "\u09b0", "\u09b2",
"\u09b6", "\u09b7",
+ "\u09b8", "\u09b9", "\u09af\u09bc", "\u09a1\u09bc", "\u09a2",
"\u0985", "\u0986",
+ "\u0987", "\u0988", "\u0989", "\u098a", "\u098b", "\u09e0",
"\u098f", "\u0990",
+ "\u0993", "\u0994" };
+
+ // A, Ą, B, C, Č, D, E, Ę, Ė, F, G, H, I, Į, Y, J, K, L, M, N, O, P,
R, S, Š, T, U, Ų,
+ // Ū, V, Z, Ž
+ static final String[] LITHUANIAN = { "A", "\u0104", "B", "C",
"\u010c", "D", "E", "\u0118",
+ "\u0116", "F", "G", "H", "I", "\u012e", "Y", "J", "K", "L", "M",
"N", "O", "P", "R",
+ "S", "\u0160", "T", "U", "\u0172", "\u016a", "V", "Z", "\u017d" };
+
+ // A, B, C, Č, D, E, F, G, H, I, J, K, L, M, N, O, P, R, S, Š, T, U,
V, Z, Ž
+ static final String[] SLOVENE = { "A", "B", "C", "\u010c", "D", "E",
"F", "G", "H", "I",
+ "J", "K", "L", "M", "N", "O", "P", "R", "S", "\u0160", "T", "U",
"V", "Z", "\u017d" };
+
+ // Contains "TAMIL LETTER"s from
http://www.unicode.org/charts/PDF/U0B80.pdf
+ //அ, ஆ, இ, ஈ, உ, ஊ, எ, ஏ, ஐ, ஒ, ஓ, ஔ, க, ங, ச, ஜ, ஞ,
+ //ட, ண, த, ந, ன, ப, ம, ய, ர, ற, ல, ள, ழ, வ, ஶ, ஷ, ஸ, ஹ
+ static final String[] TAMIL = { "\u0B85", "\u0B86", "\u0B87",
"\u0B88", "\u0B89", "\u0B8A",
+ "\u0B8E", "\u0B8F", "\u0B90", "\u0B92", "\u0B93", "\u0B94",
"\u0B95", "\u0B99",
+ "\u0B9A", "\u0B9C", "\u0B9E", "\u0B9F", "\u0BA3", "\u0BA4",
"\u0BA8", "\u0BA9",
+ "\u0BAA", "\u0BAE", "\u0BAF", "\u0BB0", "\u0BB1", "\u0BB2",
"\u0BB3", "\u0BB4",
+ "\u0BB5", "\u0BB6", "\u0BB7", "\u0BB8", "\u0BB9" };
+
+ static final String STRING = "upper({0})";
+
+ static final String[] JAPANESE_ROLODEX = {
+ // Notes: unistr('\xxxx') is the Oracle sql expression to get
unicode
+ // character by code point.
+ // Two backslashes are converted to one backslash by java compiler.
+ /* 'A' */"unistr('\\3041')",
+ /* 'Ka' */"unistr('\\30F5')",
+ /* 'Sa' */"unistr('\\3055')",
+ /* 'Ta' */"unistr('\\305F')",
+ /* 'Na' */"unistr('\\306A')",
+ /* 'Ha' */"unistr('\\306F')",
+ /* 'Ma' */"unistr('\\307E')",
+ /* 'Ya' */"unistr('\\3084')",
+ /* 'Ra' */"unistr('\\3089')",
+ /* 'Wa' */"unistr('\\308E')", "unistr('\\309D')" };
+
+ // Java string counterparts of JAPANESE_ROLODEX: the same boundary characters written
+ // as Java unicode escapes instead of Oracle unistr() expressions. Only the final
+ // entry differs.
+ static final String[] JAPANESE_ROLODEX_JAVA = {
+ /* 'A' */"\u3041",
+ /* 'Ka' */"\u30F5",
+ /* 'Sa' */"\u3055",
+ /* 'Ta' */"\u305F",
+ /* 'Na" */"\u306A",
+ /* 'Ha' */"\u306F",
+ /* 'Ma' */"\u307E",
+ /* 'Ya' */"\u3084",
+ /* 'Ra' */"\u3089",
+ /* 'Wa' */"\u308E",
+ "\u3001" // this is the first character after the last valid kana
in java
+ };
+ }
+
+    /**
+     * Apex and possibly other things collate based on upper case versions of strings.
+     * Always upper casing and then comparing is slow, though, so this method is intended
+     * to return a collator that is consistent with upper-case-then-compare while perhaps
+     * doing something more efficient.
+     *
+     * @param isPostgres selects which upper-casing rules getUpperCaseValue() applies
+     * @return a new Collator wrapping a private copy of this instance's collator
+     */
+    public Collator getUpperCaseCollator(final boolean isPostgres) {
+        final Collator innerCollator = getCollator();
+
+        // So far, the best I've been able to do that doesn't break sort order is to special
+        // case the english locale and scan for non-ascii characters before deciding how to
+        // proceed. With some work the same basic idea would work in many other locales but
+        // it would be very nice to find a more general and faster approach. The challenge
+        // is that upper casing effectively "normalizes" strings in a way that is very hard
+        // to replicate - for instance, western ligatures tend to get expanded by upper
+        // casing but Hangul ones don't. Even when that's all sorted out there's the issue
+        // that the built in collation rules for various locales are fairly narrowly
+        // focused. So, for instance, the English locale doesn't have rules for sorting
+        // Greek. With a case insensitive compare in the English locale, lower case Greek
+        // letters sort differently from upper case Greek letters but the English locale
+        // does upper case Greek letters.
+        //
+        // Locale is compared with equals(): a reference comparison would silently skip
+        // this fast path for an equal Locale that is not the Locale.ENGLISH instance.
+        if (!isPostgres && Locale.ENGLISH.equals(getLocale())) {
+            innerCollator.setStrength(Collator.SECONDARY);
+            return new Collator() {
+                @Override
+                public int compare(String source, String target) {
+                    // upper case only strings where the SECONDARY strength comparison
+                    // (case insensitive comparison) is possibly different for upper
+                    // cased and non upper cased strings
+                    return innerCollator.compare(getUpperCaseIfNeeded(source),
+                            getUpperCaseIfNeeded(target));
+                }
+
+                /**
+                 * Upper cases on any non-ascii character
+                 */
+                private String getUpperCaseIfNeeded(String string) {
+                    for (int i = 0; i < string.length(); i++) {
+                        final char ch = string.charAt(i);
+                        if (ch > 127) {
+                            // non-ascii character, bail and use the upper case version
+                            return getUpperCaseValue(string, false);
+                        }
+                    }
+                    // no non-ascii characters found, we don't need to upper case
+                    // - sorting with strength SECONDARY is equivalent.
+                    return string;
+                }
+
+                @Override
+                public CollationKey getCollationKey(String source) {
+                    return innerCollator.getCollationKey(getUpperCaseIfNeeded(source));
+                }
+
+                @Override
+                public int hashCode() {
+                    return LinguisticSort.this.hashCode();
+                }
+
+                @Override
+                public boolean equals(Object that) {
+                    return super.equals(that);
+                }
+            };
+        } else {
+            // General case: always upper case both sides before delegating.
+            return new Collator() {
+                @Override
+                public int compare(String source, String target) {
+                    return innerCollator.compare(getUpperCaseValue(source, isPostgres),
+                            getUpperCaseValue(target, isPostgres));
+                }
+
+                @Override
+                public CollationKey getCollationKey(String source) {
+                    return innerCollator.getCollationKey(getUpperCaseValue(source, isPostgres));
+                }
+
+                @Override
+                public int hashCode() {
+                    return LinguisticSort.this.hashCode();
+                }
+
+                @Override
+                public boolean equals(Object that) {
+                    return super.equals(that);
+                }
+            };
+        }
+    }
+}
diff --git
a/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/LocaleUtils.java
b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/LocaleUtils.java
new file mode 100644
index 0000000000..b07e5b6620
--- /dev/null
+++ b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/LocaleUtils.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.phoenix.util.i18n;
+
+import java.util.List;
+import java.util.Locale;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+
+import org.apache.phoenix.thirdparty.com.google.common.base.Splitter;
+import org.apache.phoenix.thirdparty.com.google.common.collect.Lists;
+
+/**
+ * This utility class was partially copied from Salesforce's
internationalization utility library
+ * (com.salesforce.i18n:i18n-util:1.0.4), which was released under the
3-clause BSD License.
+ * The i18n-util library is not maintained anymore, and it was using
vulnerable dependencies.
+ * For more info, see: https://issues.apache.org/jira/browse/PHOENIX-6818
+ *
+ * A collection of utilities for dealing with Locales.
+ */
+public enum LocaleUtils {
+    INSTANCE;
+
+    /** @return the singleton instance. */
+    public static LocaleUtils get() {
+        return INSTANCE;
+    }
+
+    // TODO: The number of locales in the system is rather small,
+    // but we should probably use a ConcurrentLruMap just in case.
+    private static final ConcurrentMap<Locale, Locale> UNIQUE_LOCALE_MAP =
+            new ConcurrentHashMap<>(64, .75f, 2);
+
+    /**
+     * Returns a locale for language-only ("en") or language/country ("en_GB") iso codes.
+     * Longer codes are split on '_' into language, country and variant.
+     *
+     * @param isoCode the code to parse, may be null
+     * @return the canonical (uniquified) locale for the code, or null if isoCode is null
+     */
+    public Locale getLocaleByIsoCode(String isoCode) {
+        if (isoCode == null) {
+            return null;
+        }
+        if (isoCode.length() == 2) {
+            return uniqueifyLocale(new Locale(isoCode));
+        } else if (isoCode.length() == 5) {
+            // Fast path for "ll_CC": the separator character at index 2 is not inspected.
+            String countryIsoCode = isoCode.substring(3, 5);
+            String langIsoCode = isoCode.substring(0, 2);
+            return uniqueifyLocale(new Locale(langIsoCode, countryIsoCode));
+        } else {
+            List<String> split = Lists.newArrayList(Splitter.on('_').split(isoCode));
+            String language = split.get(0);
+            String country = split.size() > 1 ? split.get(1) : "";
+            String variant = split.size() > 2 ? split.get(2) : "";
+            return uniqueifyLocale(new Locale(language, country, variant));
+        }
+    }
+
+    /**
+     * If you're going to cache a locale, pass it through this function first so that every
+     * caller ends up holding the same canonical instance for equal locales.
+     *
+     * @param value the locale to uniquify
+     * @return the unique locale, or null if value is null
+     */
+    static Locale uniqueifyLocale(Locale value) {
+        if (value == null) {
+            return null;
+        }
+        // Cheap read first for the common case where the locale is already registered.
+        Locale known = UNIQUE_LOCALE_MAP.get(value);
+        if (known != null) {
+            return known;
+        }
+        // putIfAbsent is atomic: if another thread registered an equal locale in the
+        // meantime, return that winner instead of handing out a second "unique" instance.
+        Locale winner = UNIQUE_LOCALE_MAP.putIfAbsent(value, value);
+        return winner != null ? winner : value;
+    }
+}
diff --git
a/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/OracleUpper.java
b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/OracleUpper.java
new file mode 100644
index 0000000000..128990d180
--- /dev/null
+++ b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/OracleUpper.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.phoenix.util.i18n;
+
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ * This utility class was partially copied from Salesforce's
internationalization utility library
+ * (com.salesforce.i18n:i18n-util:1.0.4), which was released under the
3-clause BSD License.
+ * The i18n-util library is not maintained anymore, and it was using
vulnerable dependencies.
+ * For more info, see: https://issues.apache.org/jira/browse/PHOENIX-6818
+ *
+ * OracleUpper is used in combination with OracleUpperTable to generate upper-case output
+ * consistent with particular chosen Oracle expressions.
+ *
+ * @see OracleUpperTable
+ */
+public class OracleUpper {
+
+    private OracleUpper() {
+        // HideUtilityClassConstructor
+    }
+
+    /**
+     * Upper-cases {@code value} so that the result agrees with the PL/SQL expression that
+     * {@code t} was generated from.
+     * <p>
+     * Oracle's upper / nls_upper are known to disagree with Java on some characters. The
+     * string is scanned for those known exceptional characters; each one found is replaced
+     * by the mapping supplied by {@code t}, and the text between them is upper-cased with
+     * {@link String#toUpperCase(java.util.Locale)} using the table's locale. When none is
+     * present the whole string goes through a single {@code toUpperCase} call. Oracle has to
+     * be matched even for characters outside the language's alphabet, because records
+     * containing those characters must still be returned.
+     *
+     * @param t the table describing the exceptional characters and the locale to use
+     * @param value the string to upper-case
+     * @return the upper-cased string
+     */
+    public static String toUpperCase(OracleUpperTable t, String value) {
+        final char[] exceptions = t.getUpperCaseExceptions();
+
+        int hit = exceptions.length == 0 ? -1 : indexOfException(value, exceptions);
+        if (hit < 0) {
+            // Nice case: no known reason for Oracle and Java to disagree.
+            return value.toUpperCase(t.getLocale());
+        }
+
+        // Annoying case: at least one character that Oracle handles differently than Java.
+        // This costs two extra String copies (into the buffer and out of it) on top of
+        // whatever toUpperCase itself requires.
+        final StringBuilder out = new StringBuilder(value.length());
+        String tail = value;
+        while (hit >= 0) {
+            final char special = tail.charAt(hit);
+            out.append(tail.substring(0, hit).toUpperCase(t.getLocale()));
+            out.append(t.getUpperCaseExceptionMapping(special));
+
+            tail = tail.substring(hit + 1);
+            hit = indexOfException(tail, exceptions);
+        }
+        out.append(tail.toUpperCase(t.getLocale()));
+        return out.toString();
+    }
+
+    /**
+     * Finds the first occurrence of any of the (non-empty) {@code exceptions} in {@code text}.
+     * String.indexOf is preferred for a single search char; it's faster by virtue of not
+     * requiring two loops and being intrinsic.
+     */
+    private static int indexOfException(String text, char[] exceptions) {
+        return exceptions.length == 1
+                ? text.indexOf(exceptions[0])
+                : StringUtils.indexOfAny(text, exceptions);
+    }
+}
diff --git
a/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/OracleUpperTable.java
b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/OracleUpperTable.java
new file mode 100644
index 0000000000..b453a1bbd5
--- /dev/null
+++
b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/OracleUpperTable.java
@@ -0,0 +1,337 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.phoenix.util.i18n;
+
+import java.util.Locale;
+
+import edu.umd.cs.findbugs.annotations.SuppressWarnings;
+
+/**
+ * This utility class was partially copied from Salesforce's
internationalization utility library
+ * (com.salesforce.i18n:i18n-util:1.0.4), which was released under the
3-clause BSD License.
+ * The i18n-util library is not maintained anymore, and it was using
vulnerable dependencies.
+ * For more info, see: https://issues.apache.org/jira/browse/PHOENIX-6818
+ *
+ * Generated by i18n.OracleUpperTableGeneratorTest
+ * <p>
+ * An instance of this enum codifies the difference between executing a
+ * {@link #getSqlFormatString() particular PL/SQL expression} in Oracle and
executing
+ * {@link String#toUpperCase(Locale)} for a {@link #getLocale() particular
locale} in Java. These
+ * differences (also called exceptions) are expressed by the output of
+ * {@link #getUpperCaseExceptions()} and {@link
#getUpperCaseExceptionMapping(char)}.
+ * <p>
+ * The tables are generated by testing a particular set of characters that are
known to contain
+ * exceptions and {@link #toUpperCase(String) may be used} to compensate for
exceptions found and
+ * generate output in Java that will be consistent with Oracle for the given
(sql expression,
+ * locale) pair over all tested values.
+ * <p>
+ * Characters tested:
+ * <ul>
+ * <li>U+0069 i</li>
+ * <li>U+00df ß</li>
+ * <li>U+0386 Ά</li>
+ * <li>U+0388 Έ</li>
+ * <li>U+0389 Ή</li>
+ * <li>U+038a Ί</li>
+ * <li>U+038c Ό</li>
+ * <li>U+038e Ύ</li>
+ * <li>U+038f Ώ</li>
+ * <li>U+03ac ά</li>
+ * <li>U+03ad έ</li>
+ * <li>U+03ae ή</li>
+ * <li>U+03af ί</li>
+ * <li>U+03cc ό</li>
+ * <li>U+03cd ύ</li>
+ * <li>U+03ce ώ</li>
+ * </ul>
+ *
+ * @see OracleUpper
+ */
+public enum OracleUpperTable {
+ ENGLISH("upper(%s)", "en", "ß"),
+ GERMAN("nls_upper(%s, 'nls_sort=xgerman')", "de", ""),
+ FRENCH("nls_upper(%s, 'nls_sort=xfrench')", "fr", "ß"),
+ ITALIAN("nls_upper(%s, 'nls_sort=italian')", "it", "ß"),
+ SPANISH("nls_upper(%s, 'nls_sort=spanish')", "es", "ß"),
+ CATALAN("nls_upper(%s, 'nls_sort=catalan')", "ca", "ß"),
+ DUTCH("nls_upper(%s, 'nls_sort=dutch')", "nl", "ß"),
+ PORTUGUESE("nls_upper(%s, 'nls_sort=west_european')", "pt", "ß"),
+ DANISH("nls_upper(%s, 'nls_sort=danish')", "da", "ß"),
+ NORWEGIAN("nls_upper(%s, 'nls_sort=norwegian')", "no", "ß"),
+ SWEDISH("nls_upper(%s, 'nls_sort=swedish')", "sv", "ß"),
+ FINNISH("nls_upper(%s, 'nls_sort=finnish')", "fi", "ß"),
+ CZECH("nls_upper(%s, 'nls_sort=xczech')", "cs", "ß"),
+ POLISH("nls_upper(%s, 'nls_sort=polish')", "pl", "ß"),
+ TURKISH("nls_upper(translate(%s,'i','İ'), 'nls_sort=xturkish')", "tr",
"ß"),
+ CHINESE_HK("nls_upper(to_single_byte(%s), 'nls_sort=tchinese_radical_m')",
"zh", ""),
+ CHINESE_HK_STROKE("nls_upper(to_single_byte(%s),
'nls_sort=tchinese_stroke_m')", "zh", ""),
+ CHINESE_TW("nls_upper(to_single_byte(%s), 'nls_sort=tchinese_radical_m')",
"zh", ""),
+ CHINESE_TW_STROKE("nls_upper(to_single_byte(%s),
'nls_sort=tchinese_stroke_m')", "zh", ""),
+ CHINESE("nls_upper(to_single_byte(%s), 'nls_sort=schinese_radical_m')",
"zh", ""),
+ CHINESE_STROKE("nls_upper(to_single_byte(%s),
'nls_sort=schinese_stroke_m')", "zh", ""),
+ CHINESE_PINYIN("nls_upper(to_single_byte(%s),
'nls_sort=schinese_pinyin_m')", "zh", ""),
+ JAPANESE("nls_upper(to_single_byte(%s), 'nls_sort=japanese_m')", "ja", ""),
+ KOREAN("nls_upper(to_single_byte(%s), 'nls_sort=korean_m')", "ko", ""),
+ RUSSIAN("nls_upper(%s, 'nls_sort=russian')", "ru", "ß"),
+ BULGARIAN("nls_upper(%s, 'nls_sort=bulgarian')", "bg", "ß"),
+ INDONESIAN("nls_upper(%s, 'nls_sort=indonesian')", "in", "ß"),
+ ROMANIAN("nls_upper(%s, 'nls_sort=romanian')", "ro", "ß"),
+ VIETNAMESE("nls_upper(%s, 'nls_sort=vietnamese')", "vi", "ß"),
+ UKRAINIAN("nls_upper(%s, 'nls_sort=ukrainian')", "uk", "ß"),
+ HUNGARIAN("nls_upper(%s, 'nls_sort=xhungarian')", "hu", ""),
+ GREEK("nls_upper(%s, 'nls_sort=greek')", "el", "ßΆΈΉΊΌΎΏάέήίόύώ"),
+ HEBREW("nls_upper(%s, 'nls_sort=hebrew')", "iw", "ß"),
+ SLOVAK("nls_upper(%s, 'nls_sort=slovak')", "sk", "ß"),
+ SERBIAN_CYRILLIC("nls_upper(%s, 'nls_sort=generic_m')", "sr", ""),
+ SERBIAN_LATIN("nls_upper(%s, 'nls_sort=xcroatian')", "sh", "ß"),
+ BOSNIAN("nls_upper(%s, 'nls_sort=xcroatian')", "bs", "ß"),
+ GEORGIAN("nls_upper(%s, 'nls_sort=binary')", "ka", "ß"),
+ BASQUE("nls_upper(%s, 'nls_sort=west_european')", "eu", "ß"),
+ MALTESE("nls_upper(%s, 'nls_sort=west_european')", "mt", "ß"),
+ ROMANSH("nls_upper(%s, 'nls_sort=west_european')", "rm", "ß"),
+ LUXEMBOURGISH("nls_upper(%s, 'nls_sort=west_european')", "lb", "ß"),
+ IRISH("nls_upper(%s, 'nls_sort=west_european')", "ga", "ß"),
+ SLOVENE("nls_upper(%s, 'nls_sort=xslovenian')", "sl", "ß"),
+ CROATIAN("nls_upper(%s, 'nls_sort=xcroatian')", "hr", "ß"),
+ MALAY("nls_upper(%s, 'nls_sort=malay')", "ms", "ß"),
+ ARABIC("nls_upper(%s, 'nls_sort=arabic')", "ar", "ß"),
+ ESTONIAN("nls_upper(%s, 'nls_sort=estonian')", "et", "ß"),
+ ICELANDIC("nls_upper(%s, 'nls_sort=icelandic')", "is", "ß"),
+ LATVIAN("nls_upper(%s, 'nls_sort=latvian')", "lv", "ß"),
+ LITHUANIAN("nls_upper(%s, 'nls_sort=lithuanian')", "lt", "ß"),
+ KYRGYZ("nls_upper(%s, 'nls_sort=binary')", "ky", "ß"),
+ KAZAKH("nls_upper(%s, 'nls_sort=binary')", "kk", "ß"),
+ TAJIK("nls_upper(%s, 'nls_sort=russian')", "tg", "ß"),
+ BELARUSIAN("nls_upper(%s, 'nls_sort=russian')", "be", "ß"),
+ TURKMEN("nls_upper(translate(%s,'i','İ'), 'nls_sort=xturkish')", "tk",
"iß"),
+ AZERBAIJANI("nls_upper(translate(%s,'i','İ'), 'nls_sort=xturkish')", "az",
"ß"),
+ ARMENIAN("nls_upper(%s, 'nls_sort=binary')", "hy", "ß"),
+ THAI("nls_upper(%s, 'nls_sort=thai_dictionary')", "th", "ß"),
+ HINDI("nls_upper(%s, 'nls_sort=binary')", "hi", "ß"),
+ URDU("nls_upper(%s, 'nls_sort=arabic')", "ur", "ß"),
+ BENGALI("nls_upper(%s, 'nls_sort=bengali')", "bn", "ß"),
+ TAMIL("nls_upper(%s, 'nls_sort=binary')", "ta", "ß"),
+ ESPERANTO("upper(%s)", "eo", ""),
+ XWEST_EUROPEAN("NLS_UPPER(%s,'NLS_SORT=xwest_european')", "en", "");
+
+ private final String sql;
+ private final Locale locale;
+ private final char[] exceptionChars;
+
+ OracleUpperTable(String sql, String lang, String exceptionChars) { // one row of the table
+ this.sql = sql; // format string; every constant above passes one containing a %s placeholder
+ this.locale = new Locale(lang); // language-only locale
+ this.exceptionChars = exceptionChars.toCharArray(); // one array element per exceptional char
+ }
+
+ /**
+ * Return an array containing characters for which Java's String.toUpperCase method is known
+ * to deviate from the result of Oracle evaluating {@link #getSql(String) this expression}.
+ * <p>
+ * The array handed out is the one stored in this constant, not a copy.
+ *
+ * @return an array containing all exceptional characters.
+ */
+ final char[] getUpperCaseExceptions() {
+ return exceptionChars;
+ }
+
+ /**
+ * For a character, {@code exception}, contained in the array returned from
+ * {@link #getUpperCaseExceptions()}, this method returns the anticipated result of
+ * upper-casing the character in Oracle when evaluating
+ * {@link #getSql(String) this expression}.
+ * <p>
+ * The outer switch selects the character, the inner switch the table constant; any
+ * combination without a {@code return} falls out to the exception at the end.
+ *
+ * @param exception the exceptional character to look up
+ * @return the upper case of {@code exception}, according to what Oracle would do.
+ * @throws IllegalArgumentException
+ * if the character is not contained in the array returned by
+ * {@link #getUpperCaseExceptions()}.
+ */
+ final String getUpperCaseExceptionMapping(char exception) {
+ switch (exception) {
+ case 'i':
+ switch (this) {
+ case TURKMEN: return "İ"; // I
+ default: // fall out
+ }
+ break;
+ case 'ß':
+ switch (this) {
+ case ENGLISH: return "ß"; // SS
+ case FRENCH: return "ß"; // SS
+ case ITALIAN: return "ß"; // SS
+ case SPANISH: return "ß"; // SS
+ case CATALAN: return "ß"; // SS
+ case DUTCH: return "ß"; // SS
+ case PORTUGUESE: return "ß"; // SS
+ case DANISH: return "ß"; // SS
+ case NORWEGIAN: return "ß"; // SS
+ case SWEDISH: return "ß"; // SS
+ case FINNISH: return "ß"; // SS
+ case CZECH: return "ß"; // SS
+ case POLISH: return "ß"; // SS
+ case TURKISH: return "ß"; // SS
+ case RUSSIAN: return "ß"; // SS
+ case BULGARIAN: return "ß"; // SS
+ case INDONESIAN: return "ß"; // SS
+ case ROMANIAN: return "ß"; // SS
+ case VIETNAMESE: return "ß"; // SS
+ case UKRAINIAN: return "ß"; // SS
+ case GREEK: return "ß"; // SS
+ case HEBREW: return "ß"; // SS
+ case SLOVAK: return "ß"; // SS
+ case SERBIAN_LATIN: return "ß"; // SS
+ case BOSNIAN: return "ß"; // SS
+ case GEORGIAN: return "ß"; // SS
+ case BASQUE: return "ß"; // SS
+ case MALTESE: return "ß"; // SS
+ case ROMANSH: return "ß"; // SS
+ case LUXEMBOURGISH: return "ß"; // SS
+ case IRISH: return "ß"; // SS
+ case SLOVENE: return "ß"; // SS
+ case CROATIAN: return "ß"; // SS
+ case MALAY: return "ß"; // SS
+ case ARABIC: return "ß"; // SS
+ case ESTONIAN: return "ß"; // SS
+ case ICELANDIC: return "ß"; // SS
+ case LATVIAN: return "ß"; // SS
+ case LITHUANIAN: return "ß"; // SS
+ case KYRGYZ: return "ß"; // SS
+ case KAZAKH: return "ß"; // SS
+ case TAJIK: return "ß"; // SS
+ case BELARUSIAN: return "ß"; // SS
+ case TURKMEN: return "ß"; // SS
+ case AZERBAIJANI: return "ß"; // SS
+ case ARMENIAN: return "ß"; // SS
+ case THAI: return "ß"; // SS
+ case HINDI: return "ß"; // SS
+ case URDU: return "ß"; // SS
+ case BENGALI: return "ß"; // SS
+ case TAMIL: return "ß"; // SS
+ default: // fall out
+ }
+ break;
+ case 'Ά':
+ switch (this) {
+ case GREEK: return "Α"; // Ά
+ default: // fall out
+ }
+ break;
+ case 'Έ':
+ switch (this) {
+ case GREEK: return "Ε"; // Έ
+ default: // fall out
+ }
+ break;
+ case 'Ή':
+ switch (this) {
+ case GREEK: return "Η"; // Ή
+ default: // fall out
+ }
+ break;
+ case 'Ί':
+ switch (this) {
+ case GREEK: return "Ι"; // Ί
+ default: // fall out
+ }
+ break;
+ case 'Ό':
+ switch (this) {
+ case GREEK: return "Ο"; // Ό
+ default: // fall out
+ }
+ break;
+ case 'Ύ':
+ switch (this) {
+ case GREEK: return "Υ"; // Ύ
+ default: // fall out
+ }
+ break;
+ case 'Ώ':
+ switch (this) {
+ case GREEK: return "Ω"; // Ώ
+ default: // fall out
+ }
+ break;
+ case 'ά':
+ switch (this) {
+ case GREEK: return "Α"; // Ά
+ default: // fall out
+ }
+ break;
+ case 'έ':
+ switch (this) {
+ case GREEK: return "Ε"; // Έ
+ default: // fall out
+ }
+ break;
+ case 'ή':
+ switch (this) {
+ case GREEK: return "Η"; // Ή
+ default: // fall out
+ }
+ break;
+ case 'ί':
+ switch (this) {
+ case GREEK: return "Ι"; // Ί
+ default: // fall out
+ }
+ break;
+ case 'ό':
+ switch (this) {
+ case GREEK: return "Ο"; // Ό
+ default: // fall out
+ }
+ break;
+ case 'ύ':
+ switch (this) {
+ case GREEK: return "Υ"; // Ύ
+ default: // fall out
+ }
+ break;
+ case 'ώ':
+ switch (this) {
+ case GREEK: return "Ω"; // Ώ
+ default: // fall out
+ }
+ break;
+ }
+ throw new IllegalArgumentException(
+ "No upper case mapping for char=" + exception
+ + " and this=" + this);
+ }
+
+ @SuppressWarnings(value = "EI_EXPOSE_REP", justification = "By design.")
+ public final Locale getLocale() { // the locale built from this constant's language code
+ return locale;
+ }
+
+ public String getSqlFormatString() { // the raw format string, placeholder not yet filled in
+ return sql;
+ }
+
+ public String getSql(String expr) { // formats the stored format string with expr as its argument
+ return String.format(sql, expr);
+ }
+
+ public String toUpperCase(String value) { // shorthand for OracleUpper.toUpperCase(this, value)
+ return OracleUpper.toUpperCase(this, value);
+ }
+
+ /**
+  * Looks up the table constant whose name equals {@code sort}.
+  *
+  * @param sort the exact name of an {@code OracleUpperTable} constant
+  * @return the constant with that name
+  * @throws IllegalArgumentException if no constant has that name
+  * @throws NullPointerException if {@code sort} is {@code null}
+  */
+ public static OracleUpperTable forLinguisticSort(String sort) {
+     return OracleUpperTable.valueOf(sort);
+ }
+}
+
diff --git
a/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/package-info.java
b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/package-info.java
new file mode 100644
index 0000000000..3878a7c082
--- /dev/null
+++ b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/package-info.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * This package contains utility classes partially copied from Salesforce's
+ * internationalization utility library (com.salesforce.i18n:i18n-util:1.0.4),
which was
+ * released under the 3-clause BSD License.
+ *
+ * The i18n-util library is not maintained anymore, and it was using
vulnerable dependencies.
+ * For more info, see: https://issues.apache.org/jira/browse/PHOENIX-6818
+ */
+package org.apache.phoenix.util.i18n;
diff --git
a/phoenix-core/src/test/java/org/apache/phoenix/util/i18n/LinguisticSortTest.java
b/phoenix-core/src/test/java/org/apache/phoenix/util/i18n/LinguisticSortTest.java
new file mode 100644
index 0000000000..7603b4d5b7
--- /dev/null
+++
b/phoenix-core/src/test/java/org/apache/phoenix/util/i18n/LinguisticSortTest.java
@@ -0,0 +1,650 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.phoenix.util.i18n;
+
+import static org.apache.phoenix.util.i18n.LinguisticSort.AZERBAIJANI;
+import static org.apache.phoenix.util.i18n.LinguisticSort.BASQUE;
+import static org.apache.phoenix.util.i18n.LinguisticSort.BENGALI;
+import static org.apache.phoenix.util.i18n.LinguisticSort.BOSNIAN;
+import static org.apache.phoenix.util.i18n.LinguisticSort.BULGARIAN;
+import static org.apache.phoenix.util.i18n.LinguisticSort.CATALAN;
+import static org.apache.phoenix.util.i18n.LinguisticSort.CHINESE_HK;
+import static org.apache.phoenix.util.i18n.LinguisticSort.CHINESE_HK_STROKE;
+import static org.apache.phoenix.util.i18n.LinguisticSort.CHINESE_TW;
+import static org.apache.phoenix.util.i18n.LinguisticSort.CHINESE_TW_STROKE;
+import static org.apache.phoenix.util.i18n.LinguisticSort.CROATIAN;
+import static org.apache.phoenix.util.i18n.LinguisticSort.ESTONIAN;
+import static org.apache.phoenix.util.i18n.LinguisticSort.FINNISH;
+import static org.apache.phoenix.util.i18n.LinguisticSort.HUNGARIAN;
+import static org.apache.phoenix.util.i18n.LinguisticSort.ICELANDIC;
+import static org.apache.phoenix.util.i18n.LinguisticSort.JAPANESE;
+import static org.apache.phoenix.util.i18n.LinguisticSort.KOREAN;
+import static org.apache.phoenix.util.i18n.LinguisticSort.LATVIAN;
+import static org.apache.phoenix.util.i18n.LinguisticSort.LITHUANIAN;
+import static org.apache.phoenix.util.i18n.LinguisticSort.ROMANIAN;
+import static org.apache.phoenix.util.i18n.LinguisticSort.SERBIAN_LATIN;
+import static org.apache.phoenix.util.i18n.LinguisticSort.SLOVAK;
+import static org.apache.phoenix.util.i18n.LinguisticSort.SLOVENE;
+import static org.apache.phoenix.util.i18n.LinguisticSort.TAJIK;
+import static org.apache.phoenix.util.i18n.LinguisticSort.TURKISH;
+import static org.apache.phoenix.util.i18n.LinguisticSort.TURKMEN;
+import static org.apache.phoenix.util.i18n.LinguisticSort.VIETNAMESE;
+import static org.apache.phoenix.util.i18n.LinguisticSort.LUXEMBOURGISH;
+import static org.apache.phoenix.util.i18n.LinguisticSort.URDU;
+import static org.apache.phoenix.util.i18n.LinguisticSort.TAMIL;
+import static org.apache.phoenix.util.i18n.LinguisticSort.ESPERANTO;
+
+import com.ibm.icu.text.Normalizer2;
+
+import java.text.CollationKey;
+import java.text.Collator;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.phoenix.thirdparty.com.google.common.collect.ImmutableList;
+import org.apache.phoenix.thirdparty.com.google.common.collect.Ordering;
+
+import junit.framework.TestCase;
+
+/**
+ * This test class was partially copied from Salesforce's internationalization
utility library
+ * (com.salesforce.i18n:i18n-util:1.0.4), which was released under the
3-clause BSD License.
+ * The i18n-util library is not maintained anymore, and it was using
vulnerable dependencies.
+ * For more info, see: https://issues.apache.org/jira/browse/PHOENIX-6818
+ *
+ * This could be expanded significantly.
+ */
+public class LinguisticSortTest extends TestCase {
+
+ public LinguisticSortTest(String name) { // passes the test name through to junit.framework.TestCase
+ super(name);
+ }
+
+ public void testThaiBasicSorting() {
+     final LinguisticSort thai = LinguisticSort.get(new Locale("th"));
+
+     // Basic sanity check on Thai collator comparisons: the same small input must come out
+     // in the same order from both the non-caching and the caching comparator.
+     final ImmutableList<String> input =
+             ImmutableList.of("azw", "Ac", "ab", "21", "zaa", "b\u0e40k", "bk");
+     final ImmutableList<String> expected =
+             ImmutableList.of("21", "ab", "Ac", "azw", "bk", "b\u0e40k", "zaa");
+
+     assertEquals(expected,
+             Ordering.from(thai.getNonCachingComparator()).sortedCopy(input));
+     assertEquals(expected,
+             Ordering.from(thai.getComparator(16)).sortedCopy(input));
+ }
+
+ /**
+  * Regression test for Thai strings ending in one of the characters \u0e40 .. \u0e44,
+  * which made the JDK 6 collator loop forever (see
+  * http://bugs.sun.com/view_bug.do?bug_id=5047314).
+  */
+ public void testThaiCharactersOfDeath() {
+     // This is the original bug report.
+     // Any one of \u0e40, \u0e41, \u0e42, \u0e43, or \u0e44 will do.
+     Collator jdkThai = Collator.getInstance(new Locale("th"));
+     String s = "\u0e40";
+     // In JDK6: runs forever. A string always compares equal to itself, so assert that
+     // instead of printing the result to stdout.
+     assertEquals(0, jdkThai.compare(s, s));
+
+     // Here's the "real" test
+     Collator thaiColl = LinguisticSort.get(new Locale("th")).getCollator();
+
+     // A previously used patched collator appended a space after a problematic character
+     // at the end of the string; the collator under test is expected to keep the source
+     // string untouched, so each key must report exactly the string it was built from.
+     String[] oomStrings = {
+         "\u0e3f", "\u0e45", "\u0e40k", "\u0e44", "\u0e43", "\u0e42", "\u0e41", "\u0e40"
+     };
+
+     for (int i = 0; i < oomStrings.length; i++) {
+         CollationKey key = thaiColl.getCollationKey(oomStrings[i]);
+         assertEquals("string #" + i, oomStrings[i], key.getSourceString());
+     }
+ }
+
+ public void testRolodexIndexByChar() throws Exception{ // rolodex index checks for several sorts
+ LinguisticSort englishSort = LinguisticSort.ENGLISH;
+
+ assertEquals(0, englishSort.getRolodexIndexForChar("a"));
+ assertEquals(0, englishSort.getRolodexIndexForChar("Á"));
+ assertEquals(1, englishSort.getRolodexIndexForChar("b"));
+ assertEquals(13, englishSort.getRolodexIndexForChar("N"));
+ assertEquals(13, englishSort.getRolodexIndexForChar("Ñ"));
+ assertEquals(25, englishSort.getRolodexIndexForChar("z"));
+ //А below is the Cyrillic А; assertOther expects index == getAlphabetLength()
+ assertOther(Arrays.asList("А", "こ"), englishSort);
+
+ //Spanish: Ñ is expected in its own slot (14), which shifts z to 26
+ LinguisticSort spanishSort = LinguisticSort.SPANISH;
+ assertEquals(0, spanishSort.getRolodexIndexForChar("a"));
+ assertEquals(0, spanishSort.getRolodexIndexForChar("Á"));
+ assertEquals(1, spanishSort.getRolodexIndexForChar("b"));
+ assertEquals(13, spanishSort.getRolodexIndexForChar("N"));
+ assertEquals(14, spanishSort.getRolodexIndexForChar("Ñ"));
+ assertEquals(26, spanishSort.getRolodexIndexForChar("z"));
+ //А below is the Cyrillic А
+ assertOther(Arrays.asList("А", "こ"), spanishSort);
+
+ //Japanese
+ LinguisticSort japaneseSort = LinguisticSort.JAPANESE;
+ assertEquals(0, japaneseSort.getRolodexIndexForChar("a"));
+ assertEquals(0, japaneseSort.getRolodexIndexForChar("Á"));
+ assertEquals(1, japaneseSort.getRolodexIndexForChar("b"));
+ assertEquals(13, japaneseSort.getRolodexIndexForChar("N"));
+ assertEquals(13, japaneseSort.getRolodexIndexForChar("Ñ"));
+ assertEquals(25, japaneseSort.getRolodexIndexForChar("z"));
+ assertEquals(27, japaneseSort.getRolodexIndexForChar("こ"));
+ assertEquals(27, japaneseSort.getRolodexIndexForChar("く"));
+ assertEquals(31, japaneseSort.getRolodexIndexForChar("ふ"));
+ //А below is the Cyrillic А
+ assertOther(Arrays.asList("\u0410"), spanishSort); // А -- NOTE(review): uses spanishSort inside the Japanese section; japaneseSort was probably intended -- confirm before changing
+
+ //Malay has a rolodex
+ LinguisticSort malaySort = LinguisticSort.MALAY;
+ assertEquals(0, malaySort.getRolodexIndexForChar("a"));
+ assertEquals(25, malaySort.getRolodexIndexForChar("z"));
+ assertOther(Arrays.asList("\u0410", "\u304f"), malaySort); // "А", "く"
+
+ // Thai has a rolodex, all of these should be "other"
+ // (Thai has 44 chars, so other is 46)
+ LinguisticSort thaiSort = LinguisticSort.THAI;
+ assertConstant(Arrays.asList("A", "Á", "b", "\u304f", "\u0410"),
+ thaiSort, 46, "had a rolodex index.");
+
+ }
+
+ /**
+  * Compares the alphabet of every {@link LinguisticSort} with the alphabet obtained from
+  * ICU for the same locale. Sorts listed in {@code knownDifferences} are skipped.
+  */
+ public void testRolodexComparedToIcu() {
+     Set<LinguisticSort> knownDifferences = EnumSet.of(
+             CATALAN, FINNISH, TURKISH, CHINESE_HK, CHINESE_HK_STROKE, CHINESE_TW,
+             CHINESE_TW_STROKE, JAPANESE, KOREAN, BULGARIAN, ROMANIAN, VIETNAMESE,
+             HUNGARIAN, SLOVAK, SERBIAN_LATIN, BOSNIAN, BASQUE, LUXEMBOURGISH, SLOVENE,
+             CROATIAN, ESTONIAN, ICELANDIC, LATVIAN, LITHUANIAN, TAJIK, TURKMEN, AZERBAIJANI,
+             URDU, BENGALI, TAMIL, ESPERANTO);
+
+     for (LinguisticSort sort : LinguisticSort.values()) {
+         if (knownDifferences.contains(sort)) {
+             continue;
+         }
+
+         String alphaAsString = Arrays.toString(sort.getAlphabet());
+         String icuAlphaAsString =
+                 Arrays.toString(LinguisticSort.getAlphabetFromICU(sort.getLocale()));
+
+         // assertEquals on Strings already reports both values on failure, so no separate
+         // diagnostic output is needed (the former print-on-mismatch branch sat after the
+         // assertion and could never run).
+         assertEquals("LinguisticSort for " + sort + " doesn't match",
+                 icuAlphaAsString, alphaAsString);
+     }
+ }
+
+ private void assertOther(Collection<String> chars, LinguisticSort sort){
+ assertConstant(chars, sort, sort.getAlphabetLength(), "wasn't in
'Other' category");
+ }
+
+ private void assertConstant(Collection<String> chars, LinguisticSort sort,
+ int constant, String message) {
+ for (String c : chars){
+ assertEquals(c + " " + message, constant,
sort.getRolodexIndexForChar(c));
+ }
+ }
+
+ /**
+  * Make sure the upper case collator works equivalently to upper-casing then collating.
+  * Runs once with ascii-only input and once with general strings.
+  */
+ public void testUpperCaseCollator() {
+     // bump these up for performance testing
+     final int repeatTimes = 1;
+     final int testSize = 1000;
+
+     for (boolean asciiOnly : new boolean[] { true, false }) {
+         testUpperCaseCollator(asciiOnly, repeatTimes, testSize);
+     }
+ }
+
+ /**
+ * Implementation of the testUpperCaseCollator that allows breaking out an
ascii only
+ * test from a general string test
+ */
+ private void testUpperCaseCollator(boolean asciiOnly, int repeatTimes, int
testSize) {
+ final LinguisticSort sort = LinguisticSort.ENGLISH;
+ final Collator collator = sort.getCollator();
+
+ final Collator ucCollator = sort.getUpperCaseCollator(false);
+
+ final Random r = new Random();
+ final int maxLength = 100;
+ for (int iteration = 0; iteration < repeatTimes; iteration++) {
+ final boolean lastTime = iteration == repeatTimes - 1;
+ final String[] originals = new String[testSize];
+ for (int i = 0; i < testSize; i++) {
+ switch (i) {
+ case 0:
+ originals[i] = "abß";
+ break;
+ case 1:
+ originals[i] = "abSS";
+ break;
+ case 2:
+ originals[i] = "abß";
+ break;
+ case 3:
+ originals[i] = "ffo";
+ break;
+ case 4:
+ originals[i] = "ffi";
+ break;
+ case 5:
+ originals[i] = "FFI";
+ break;
+ case 6:
+ originals[i] = "fred";
+ break;
+ case 7:
+ originals[i] = "FRED";
+ break;
+ case 8:
+ originals[i] = "FREE";
+ break;
+ case 9:
+ originals[i] = "剫";
+ break;
+ case 10:
+ originals[i] = "뻎";
+ break;
+ case 11:
+ originals[i] = "\u1fe3";
+ break;
+ case 12:
+ originals[i] = "\u05d7";
+ break;
+ case 13:
+ originals[i] = "\u1fd3";
+ break;
+ case 14:
+ originals[i] = "\u1441";
+ break;
+ case 15:
+ originals[i] = "\ub9fe";
+ break;
+ case 16:
+ originals[i] = "\u0398";
+ break;
+ case 17:
+ originals[i] = "\u0399";
+ break;
+ case 18:
+ originals[i] = "\u039a";
+ break;
+ case 19:
+ originals[i] = "\u4371";
+ break;
+ case 20:
+ originals[i] = "\ufb06";
+ break;
+ default :
+ originals[i] = randomString(r, maxLength, asciiOnly);
+ }
+ }
+
+ final int[] upperResults = new int[testSize];
+ {
+ final long start = System.currentTimeMillis();
+ for (int i = 0; i < testSize; i++) {
+ final int next = i + 1 == testSize ? 0 : i + 1;
+ upperResults[i] =
collator.compare(sort.getUpperCaseValue(originals[i], false),
+ sort.getUpperCaseValue(originals[next], false));
+ }
+ if (lastTime) {
+ final long time = System.currentTimeMillis() - start;
+ System.out.println("Compared " + testSize + " " +
(asciiOnly ? "ascii " : "") +
+ "strings with upper casing in " + time + "ms");
+ }
+ }
+
+ final int[] caseResults = new int[testSize];
+ {
+ final long start = System.currentTimeMillis();
+ for (int i = 0; i < testSize; i++) {
+ final int next = i + 1 == testSize ? 0 : i + 1;
+ caseResults[i] = ucCollator.compare(originals[i],
originals[next]);
+ }
+ if (lastTime) {
+ final long time = System.currentTimeMillis() - start;
+ System.out.println("Compared " + testSize + " " +
(asciiOnly ? "ascii " : "") +
+ "strings with upper case collator comparison in "
+ time + "ms");
+ }
+ }
+
+ final int[] keyResults = new int[testSize];
+ {
+ final long start = System.currentTimeMillis();
+ for (int i = 0; i < testSize; i++) {
+ final int next = i + 1 == testSize ? 0 : i + 1;
+ keyResults[i] = ucCollator.getCollationKey(originals[i])
+
.compareTo(ucCollator.getCollationKey(originals[next]));
+ }
+ if (lastTime) {
+ final long time = System.currentTimeMillis() - start;
+ System.out.println("Compared " + testSize + " " +
(asciiOnly ? "ascii " : "") +
+ "strings with collation keys in " + time + "ms");
+ }
+ }
+
+ if (lastTime) {
+ System.out.println();
+ }
+
+ if (lastTime) {
+ // normalizing helps see why strings don't compare the same
when upper-cased
+ final Normalizer2 normalizer = Normalizer2.getNFKDInstance();
+ for (int i = 0; i < testSize; i++) {
+ final int next = i + 1 == testSize ? 0 : i + 1;
+ final boolean caseOk = upperResults[i] == caseResults[i];
+ final boolean keyOk = upperResults[i] == keyResults[i];
+ if (!caseOk || !keyOk) {
+ final String message =
+ "Did not get expected result when comparing
string " + i + " " +
+ (caseOk ? "" : "using upper case collator
comparison ") +
+ (caseOk || keyOk ? "" : "or ") +
+ (keyOk ? "" : "using collation key comparison
") +
+ "\n" +
+ "'" + escape(originals[i]) + "'\n" +
+ "(" +
escape(sort.getUpperCaseValue(originals[i], false)) + ")\n" +
+ "<" +
escape(normalizer.normalize(originals[i])) + "> " +
+ "with string " + next + " \n" +
+ "'" + escape(originals[next]) + "'\n" +
+ "(" +
escape(sort.getUpperCaseValue(originals[next], false)) +
+ ")\n " +
+ "<" +
escape(normalizer.normalize(originals[next])) + ">";
+ assertEquals(message, upperResults[i], caseResults[i]);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+  * For diagnosis of mismatched strings: renders {@code string} code point by code point,
+  * escaping everything that is non-printable or non-ascii.
+  */
+ private String escape(String string) {
+     final StringBuilder escaped = new StringBuilder(string.length() * 2);
+     for (int pos = 0; pos < string.length(); ) {
+         final int codePoint = string.codePointAt(pos);
+         escapeCodePoint(escaped, codePoint);
+         // advance by one or two chars depending on whether this was a surrogate pair
+         pos += Character.charCount(codePoint);
+     }
+     return escaped.toString();
+ }
+
+ /**
+  * Escapes a single code point so that non-ascii and non-printable characters use
+  * their standard Java escape.
+  * <p>
+  * Printable ascii (0x20 .. 0x7E) other than {@code "} and {@code \} is appended
+  * unchanged. Everything else becomes a four-digit unicode escape per UTF-16 code unit, so
+  * a supplementary code point is written as its surrogate pair (two escapes) rather than
+  * as a single escape with more than four hex digits, which Java would not read back as
+  * that character.
+  *
+  * @param sb the builder to append to
+  * @param ch the code point to escape
+  */
+ private void escapeCodePoint(final StringBuilder sb, final int ch) {
+     switch (ch) {
+     case '\b':
+         sb.append("\\b");
+         break;
+     case '\t':
+         sb.append("\\t");
+         break;
+     case '\n':
+         sb.append("\\n");
+         break;
+     case '\r':
+         sb.append("\\r");
+         break;
+     case '\f':
+         sb.append("\\f");
+         break;
+     case '\"':
+         sb.append("\\\"");
+         break;
+     case '\\':
+         sb.append("\\\\");
+         break;
+     default:
+         if (ch >= 0x20 && ch <= 0x7E) {
+             sb.appendCodePoint(ch);
+         } else {
+             for (char unit : Character.toChars(ch)) {
+                 sb.append(String.format("\\u%04x", (int) unit));
+             }
+         }
+     }
+ }
+
+ /**
+  * Generates a random string whose length is drawn from {@code r.nextInt(maxLength)},
+  * i.e. from 0 up to but not including {@code maxLength}.
+  */
+ private String randomString(Random r, int maxLength, boolean asciiOnly) {
+     return randomFixedLengthString(r, r.nextInt(maxLength), asciiOnly);
+ }
+
+
+ /**
+  * Generates a random string of the given length. Each char is redrawn until it is a
+  * defined, non-control character; with {@code asciiOnly} the draw is limited to 0..127.
+  */
+ private String randomFixedLengthString(Random r, int length, boolean asciiOnly) {
+     final StringBuilder chars = new StringBuilder();
+     for (int produced = 0; produced < length; produced++) {
+         char candidate;
+         do {
+             candidate = (char) (asciiOnly ? r.nextInt(128) : r.nextInt());
+         } while (!Character.isDefined(candidate) || Character.isISOControl(candidate));
+         chars.append(candidate);
+     }
+     return chars.toString();
+ }
+
+ public void testUpperCaseExceptionChars() {
+ // Sharp s in English
+ String[][] enCases = new String[][] {
+ // { input, expected output }
+ new String[] { "ß", "ß" },
+ new String[] { "ßß", "ßß" },
+ new String[] { "ßßß", "ßßß" },
+ new String[] { "aß", "Aß" },
+ new String[] { "aaaß", "AAAß" },
+ new String[] { "ßa", "ßA" },
+ new String[] { "ßaaa", "ßAAA" },
+ new String[] { "aßb", "AßB" },
+ new String[] { "aaaßbbb", "AAAßBBB" },
+ new String[] { "ßaß", "ßAß" },
+ new String[] { "ßaaaß", "ßAAAß" },
+ new String[] { "aßbßc", "AßBßC" },
+ new String[] { "aaaßbbbßccc", "AAAßBBBßCCC" },
+ new String[] { "aßßc", "AßßC" },
+ new String[] { "aaaßßccc", "AAAßßCCC" },
+ };
+
+ for (String[] c : enCases) {
+ assertEquals(c[1], LinguisticSort.ENGLISH.getUpperCaseValue(c[0],
false));
+ }
+
+ // Omicron in Greek
+ String[][] greekCases = new String[][] {
+ new String[] { "\u039f", "\u039f" }, // capital omicron
+ new String[] { "Ό", "\u039f" }
+
+ };
+
+ for (String[] c : greekCases) {
+ assertEquals(c[1], LinguisticSort.GREEK.getUpperCaseValue(c[0],
false));
+ }
+ }
+
+ /** ENGLISH and ESPERANTO are expected to report true here, GERMAN false. */
+ public void testUsesUpper() {
+     assertTrue(LinguisticSort.ENGLISH.usesUpperToGetUpperCase(false));
+     assertTrue(LinguisticSort.ESPERANTO.usesUpperToGetUpperCase(false));
+     assertFalse(LinguisticSort.GERMAN.usesUpperToGetUpperCase(false));
+ }
+
+ public void testGetUpperCaseCollationKey() { // for ENGLISH both SQL builders must give equal results
+ assertEquals(LinguisticSort.ENGLISH.getUpperCaseSql("x", false),
+ LinguisticSort.ENGLISH.getUpperCollationKeySql("x", false));
+ }
+
+ /**
+  * Manual benchmark for the perf impact of the special-case logic the EN locale applies
+  * to the German sharp s, ß. The leading underscore keeps it out of the normal test run;
+  * rename this test (remove the leading _) to run it, e.g. in Eclipse.
+  * <p>
+  * Two banks of 1000 randomish Strings are generated, one with sharp s and one without.
+  * Each bank is then upper-cased 1 million times per measured round using the EN locale
+  * (with the special-case logic), a comparison locale -- EO, Esperanto -- which does not
+  * have any special-case logic, and GREEK, which has many exception characters.
+  * <p>
+  * For posterity, results once observed by the original author
+  * (averages rounded to nearest 10ms):
+  * <p>
+  * <table>
+  * <tr><td></td><td>ENGLISH</td><td>ESPERANTO</td><td>GREEK</td></tr>
+  * <tr><td>with sharp s</td><td>330ms</td><td>260ms</td><td>370ms</td></tr>
+  * <tr><td>without sharp s</td><td>150ms</td><td>130ms</td><td>213ms</td></tr>
+  * </table>
+  */
+ public void _testUpperCasePerf() {
+     final String[] sharpSBank = genStrings(1000, true);
+     final String[] plainBank = genStrings(1000, false);
+
+     // GREEK is interesting for having a lot of exceptions.
+     final LinguisticSort[] candidates = {
+         LinguisticSort.ENGLISH, LinguisticSort.ESPERANTO, LinguisticSort.GREEK
+     };
+     for (LinguisticSort candidate : candidates) {
+         System.out.println(candidate.name() + ", with ß:");
+         runUpperCase(candidate, sharpSBank);
+         System.out.println(candidate.name() + ", without ß:");
+         runUpperCase(candidate, plainBank);
+     }
+ }
+
+ private void runUpperCase(LinguisticSort sort, String[] inputs) {
+ // Warm up
+ for (int i = 0; i < 10000; i++) {
+ sort.getUpperCaseValue(inputs[i % inputs.length], false);
+ }
+
+ // Run experiment
+ for (int i = 0; i < 3; i++) {
+ long start = System.currentTimeMillis();
+ for (int j = 0; j < 1000000; j++) {
+ sort.getUpperCaseValue(inputs[j % inputs.length], false);
+ }
+
+ System.out.println("[" + (i + 1) + "] Complete in " +
+ (System.currentTimeMillis() - start) + "ms.");
+ }
+ }
+
+ /**
+ * Return n randomly generated strings, each containing at least
+ * one sharp s if useSharpS is true.
+ * */
+ private String[] genStrings(int n, boolean useSharpS) {
+ Random r = new Random();
+
+ String[] inputs = new String[n];
+ for (int i = 0; i < inputs.length; i++) {
+ inputs[i] = randomString(r, r.nextInt(12) + 1, r.nextBoolean())
+ + (useSharpS? "ß" : "")
+ + (r.nextBoolean() ?
+ randomString(r, r.nextInt(12) + 1, r.nextBoolean()) +
(useSharpS? "ß" : "")
+ : "")
+ + (randomString(r, r.nextInt(12) + 1, r.nextBoolean()));
+
+ if (!useSharpS) assertFalse(inputs[i].contains("ß"));
+ }
+ return inputs;
+ }
+
+ /**
+  * Returns a sorted copy of {@code source}, ordered by the collator of {@code sort}.
+  * The {@code source} list itself is not modified.
+  */
+ private List<String> cloneAndSort(LinguisticSort sort, List<String> source) {
+     List<String> sortedCopy = new ArrayList<String>(source.size());
+     sortedCopy.addAll(source);
+     Collections.sort(sortedCopy, sort.getCollator());
+     return sortedCopy;
+ }
+
+ /**
+  * Validate that the sorting of the linguistic sorts for various locales is "correct".
+  * The toSort list below is in this order:
+  * 阿嗄阾啊 : āáǎa
+  * 仈㶚 : bā bà
+  * 齑: ji
+  * <p>
+  * CHINESE, CHINESE_HK and CHINESE_TW are expected to agree on one order, the three
+  * *_STROKE sorts on a second one, and CHINESE_PINYIN on a third.
+  */
+ public void testChineseSorting() {
+     final List<String> toSort = ImmutableList.of(
+         "\u963f", "\u55c4", "\u963e", "\u554a", "\u4ec8", "\u3d9a", "\u9f51");
+
+     final List<String> defaultOrder = ImmutableList.of(
+         "\u4ec8", "\u554a", "\u55c4", "\u3d9a", "\u963e", "\u963f", "\u9f51");
+     final LinguisticSort[] defaultSorts = {
+         LinguisticSort.CHINESE, LinguisticSort.CHINESE_HK, LinguisticSort.CHINESE_TW
+     };
+     for (LinguisticSort sort : defaultSorts) {
+         assertEquals(defaultOrder, cloneAndSort(sort, toSort));
+     }
+
+     final List<String> strokeOrder = ImmutableList.of(
+         "\u4ec8", "\u963e", "\u963f", "\u554a", "\u55c4", "\u9f51", "\u3d9a");
+     final LinguisticSort[] strokeSorts = {
+         LinguisticSort.CHINESE_STROKE,
+         LinguisticSort.CHINESE_HK_STROKE,
+         LinguisticSort.CHINESE_TW_STROKE
+     };
+     for (LinguisticSort sort : strokeSorts) {
+         assertEquals(strokeOrder, cloneAndSort(sort, toSort));
+     }
+
+     final List<String> pinyinOrder = ImmutableList.of(
+         "\u963f", "\u55c4", "\u554a", "\u4ec8", "\u9f51", "\u963e", "\u3d9a");
+     assertEquals(pinyinOrder, cloneAndSort(LinguisticSort.CHINESE_PINYIN, toSort));
+ }
+
+ public void testChineseLocaleMapping() {
+ assertEquals(LinguisticSort.CHINESE,
+ LinguisticSort.get(new Locale("zh")));
+ assertEquals(LinguisticSort.CHINESE_TW,
+ LinguisticSort.get(new Locale("zh","TW")));
+ assertEquals(LinguisticSort.CHINESE,
+ LinguisticSort.get(new Locale("zh","SG")));
+ assertEquals(LinguisticSort.CHINESE_HK,
+ LinguisticSort.get(new Locale("zh","HK")));
+ assertEquals(LinguisticSort.CHINESE_TW_STROKE,
+ LinguisticSort.get(new Locale("zh","TW","STROKE")));
+ assertEquals(LinguisticSort.CHINESE_HK_STROKE,
+ LinguisticSort.get(new Locale("zh","HK","STROKE")));
+ assertEquals(LinguisticSort.CHINESE_STROKE,
+ LinguisticSort.get(new Locale("zh","CN","STROKE")));
+ assertEquals(LinguisticSort.CHINESE_STROKE,
+ LinguisticSort.get(new Locale("zh","SG","STROKE")));
+ assertEquals(LinguisticSort.CHINESE_STROKE,
+ LinguisticSort.get(new Locale("zh","","STROKE")));
+ assertEquals(LinguisticSort.CHINESE_PINYIN,
+ LinguisticSort.get(new Locale("zh","CN","PINYIN")));
+ assertEquals(LinguisticSort.CHINESE_PINYIN,
+ LinguisticSort.get(new Locale("zh","SG","PINYIN")));
+ assertEquals(LinguisticSort.CHINESE_PINYIN,
+ LinguisticSort.get(new Locale("zh","","PINYIN")));
+ }
+}
diff --git
a/phoenix-core/src/test/java/org/apache/phoenix/util/i18n/OracleUpperTableGeneratorTest.java
b/phoenix-core/src/test/java/org/apache/phoenix/util/i18n/OracleUpperTableGeneratorTest.java
new file mode 100644
index 0000000000..2e101cf78d
--- /dev/null
+++
b/phoenix-core/src/test/java/org/apache/phoenix/util/i18n/OracleUpperTableGeneratorTest.java
@@ -0,0 +1,391 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.phoenix.util.i18n;
+
+import junit.framework.TestCase;
+
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.Locale;
+
+/**
+ * This test class was partially copied from Salesforce's internationalization
utility library
+ * (com.salesforce.i18n:i18n-util:1.0.4), which was released under the
3-clause BSD License.
+ * The i18n-util library is not maintained anymore, and it was using
vulnerable dependencies.
+ * For more info, see: https://issues.apache.org/jira/browse/PHOENIX-6818
+ *
+ * A generator for OracleUpperTable.java. This generator creates an OracleUpperTable
+ * value for each of a number of {@link UpperExpr PL/SQL expressions}. Each value simply
+ * tabulates the differences between evaluating that expression in Oracle and calling
+ * {@link String#toUpperCase(Locale)} in Java, allowing them to be compensated for.
+ * <p>
+ * May be run as a JUnit test or as a stand-alone Java application. Run the
output in Oracle
+ * to generate the source for OracleUpperTable.java.
+ *
+ * @see OracleUpper
+ * @see OracleUpperTable
+ */
+public class OracleUpperTableGeneratorTest extends TestCase {
+
+ /**
+ * The characters whose upper-casing is compared between Oracle and Java. Only these
+ * characters are probed when the exception tables are generated.
+ */
+ private static final char[] charsToTest = new char[] {
+ // i may be messed up for Turkic languages where it's supposed to upper-case
+ // to dotted I.
+ 'i',
+ // Sharp s may upper-case to SS or itself, depending on the details.
+ 'ß',
+ // Oracle removes tonos from all of these when upper-casing.
+ 'Ά', 'Έ', 'Ή', 'Ί', 'Ό', 'Ύ','Ώ','ά','έ','ή','ί','ό','ύ','ώ'
+ };
+
+ /**
+ * PL/SQL upper-casing expressions, each paired with the language code of the Java
+ * locale it is compared against.
+ * <p>
+ * Most of these were just generated from the LinguisticSort enum:
+ *
+ * <pre><code>
+ * public static void generateValuesFromLinguisticSort() {
+ * for (LinguisticSort s : LinguisticSort.values()) {
+ * System.out.println(String.format("%1$s(\"%2$s\", \"%3$s\"),",
+ * s.name(), s.getUpperSqlFormatString(), s.getLocale().getLanguage()));
+ * }
+ * }
+ * </code></pre>
+ *
+ * Each value is a PL/SQL upper case expression that may return different results than
+ * Java's String.toUpperCase method for the given language.
+ */
+ private enum UpperExpr {
+ ENGLISH("upper(%s)", "en"),
+ GERMAN("nls_upper(%s, 'nls_sort=xgerman')", "de"),
+ FRENCH("nls_upper(%s, 'nls_sort=xfrench')", "fr"),
+ ITALIAN("nls_upper(%s, 'nls_sort=italian')", "it"),
+ SPANISH("nls_upper(%s, 'nls_sort=spanish')", "es"),
+ CATALAN("nls_upper(%s, 'nls_sort=catalan')", "ca"),
+ DUTCH("nls_upper(%s, 'nls_sort=dutch')", "nl"),
+ PORTUGUESE("nls_upper(%s, 'nls_sort=west_european')", "pt"),
+ DANISH("nls_upper(%s, 'nls_sort=danish')", "da"),
+ NORWEGIAN("nls_upper(%s, 'nls_sort=norwegian')", "no"),
+ SWEDISH("nls_upper(%s, 'nls_sort=swedish')", "sv"),
+ FINNISH("nls_upper(%s, 'nls_sort=finnish')", "fi"),
+ CZECH("nls_upper(%s, 'nls_sort=xczech')", "cs"),
+ POLISH("nls_upper(%s, 'nls_sort=polish')", "pl"),
+ TURKISH("nls_upper(translate(%s,'i','İ'), 'nls_sort=xturkish')", "tr"),
+ CHINESE_HK("nls_upper(to_single_byte(%s),
'nls_sort=tchinese_radical_m')", "zh"),
+ CHINESE_TW("nls_upper(to_single_byte(%s),
'nls_sort=tchinese_radical_m')", "zh"),
+ CHINESE("nls_upper(to_single_byte(%s),
'nls_sort=schinese_radical_m')", "zh"),
+ JAPANESE("nls_upper(to_single_byte(%s), 'nls_sort=japanese_m')", "ja"),
+ KOREAN("nls_upper(to_single_byte(%s), 'nls_sort=korean_m')", "ko"),
+ RUSSIAN("nls_upper(%s, 'nls_sort=russian')", "ru"),
+ BULGARIAN("nls_upper(%s, 'nls_sort=bulgarian')", "bg"),
+ INDONESIAN("nls_upper(%s, 'nls_sort=indonesian')", "in"),
+ ROMANIAN("nls_upper(%s, 'nls_sort=romanian')", "ro"),
+ VIETNAMESE("nls_upper(%s, 'nls_sort=vietnamese')", "vi"),
+ UKRAINIAN("nls_upper(%s, 'nls_sort=ukrainian')", "uk"),
+ HUNGARIAN("nls_upper(%s, 'nls_sort=xhungarian')", "hu"),
+ GREEK("nls_upper(%s, 'nls_sort=greek')", "el"),
+ HEBREW("nls_upper(%s, 'nls_sort=hebrew')", "iw"),
+ SLOVAK("nls_upper(%s, 'nls_sort=slovak')", "sk"),
+ SERBIAN_CYRILLIC("nls_upper(%s, 'nls_sort=generic_m')", "sr"),
+ SERBIAN_LATIN("nls_upper(%s, 'nls_sort=xcroatian')", "sh"),
+ BOSNIAN("nls_upper(%s, 'nls_sort=xcroatian')", "bs"),
+ GEORGIAN("nls_upper(%s, 'nls_sort=binary')", "ka"),
+ BASQUE("nls_upper(%s, 'nls_sort=west_european')", "eu"),
+ MALTESE("nls_upper(%s, 'nls_sort=west_european')", "mt"),
+ ROMANSH("nls_upper(%s, 'nls_sort=west_european')", "rm"),
+ LUXEMBOURGISH("nls_upper(%s, 'nls_sort=west_european')", "lb"),
+ IRISH("nls_upper(%s, 'nls_sort=west_european')", "ga"),
+ SLOVENE("nls_upper(%s, 'nls_sort=xslovenian')", "sl"),
+ CROATIAN("nls_upper(%s, 'nls_sort=xcroatian')", "hr"),
+ MALAY("nls_upper(%s, 'nls_sort=malay')", "ms"),
+ ARABIC("nls_upper(%s, 'nls_sort=arabic')", "ar"),
+ ESTONIAN("nls_upper(%s, 'nls_sort=estonian')", "et"),
+ ICELANDIC("nls_upper(%s, 'nls_sort=icelandic')", "is"),
+ LATVIAN("nls_upper(%s, 'nls_sort=latvian')", "lv"),
+ LITHUANIAN("nls_upper(%s, 'nls_sort=lithuanian')", "lt"),
+ KYRGYZ("nls_upper(%s, 'nls_sort=binary')", "ky"),
+ KAZAKH("nls_upper(%s, 'nls_sort=binary')", "kk"),
+ TAJIK("nls_upper(%s, 'nls_sort=russian')", "tg"),
+ BELARUSIAN("nls_upper(%s, 'nls_sort=russian')", "be"),
+ TURKMEN("nls_upper(translate(%s,'i','İ'), 'nls_sort=xturkish')", "tk"),
+ AZERBAIJANI("nls_upper(translate(%s,'i','İ'), 'nls_sort=xturkish')",
"az"),
+ ARMENIAN("nls_upper(%s, 'nls_sort=binary')", "hy"),
+ THAI("nls_upper(%s, 'nls_sort=thai_dictionary')", "th"),
+ HINDI("nls_upper(%s, 'nls_sort=binary')", "hi"),
+ URDU("nls_upper(%s, 'nls_sort=arabic')", "ur"),
+ BENGALI("nls_upper(%s, 'nls_sort=bengali')", "bn"),
+ TAMIL("nls_upper(%s, 'nls_sort=binary')", "ta"),
+ ESPERANTO("upper(%s)", "eo"),
+
+ // for formulas
+ XWEST_EUROPEAN("NLS_UPPER(%s,'NLS_SORT=xwest_european')", "en");
+
+
+ /** PL/SQL format string; %s marks where the input expression is substituted. */
+ private final String expr;
+ /** Locale built from the language code; used for the Java side of the comparison. */
+ private final Locale locale;
+
+ /**
+ * @param expr the PL/SQL expression with %s wildcards for the single string input.
+ * @param langCode ISO code for the language to use, as in
+ * <code>str.toUpperCase(new Locale(langCode))</code>.
+ */
+ private UpperExpr(String expr, String langCode) {
+ this.expr = expr;
+ this.locale = new Locale(langCode);
+ }
+
+ /**
+ * Returns this PL/SQL expression applied to {@code value}, where the character is
+ * written as a {@code unistr('\XXXX')} literal built from its four-digit hex code.
+ */
+ private String getSql(char value) {
+ return String.format(expr, "unistr('\\" + hexCodePoint(value) +
"')");
+ }
+
+ /**
+ * Returns the result of upper-casing {@code value} in Java with this value's locale.
+ * The result is a String because upper-casing may yield more than one character.
+ */
+ private String getJava(char value) {
+ return Character.toString(value).toUpperCase(locale);
+ }
+ }
+
+ /**
+ * This method generates some anonymous PL/SQL routines which, when run, will generate an
+ * OracleUpperTable value for each {@code UpperExpr}. Each table is created by comparing
+ * the result of {@link String#toUpperCase(Locale)} against a
+ * {@link UpperExpr#getSql(char) PL/SQL expression}. The table contains all deviations from
+ * Oracle for each character in a {@link #charsToTest given set} that we know are fussy.
+ * <p>
+ * The Java values are computed here and embedded in the script as literals; the Oracle
+ * values are computed by the database when the script is executed. ESPERANTO is skipped
+ * on purpose so that it has no exceptions and can serve as a baseline.
+ *
+ * @param out the writer that receives the generated script; this method neither flushes
+ * nor closes it.
+ */
+ public static void generateUpperCaseExceptions(PrintWriter out) {
+
+ // Script preamble, then open the anonymous PL/SQL block.
+ out.println("set serveroutput on;");
+ // So we don't have to escape ampersands.
+ out.println("set define off;");
+ out.println("/");
+ out.println("BEGIN");
+
+ // License header of the generated Java source.
+ putLine(out, "/*");
+ putLine(out, " * Licensed to the Apache Software Foundation (ASF)
under one or more");
+ putLine(out, " * contributor license agreements. See the NOTICE file
distributed with");
+ putLine(out, " * this work for additional information regarding
copyright ownership.");
+ putLine(out, " * The ASF licenses this file to you under the Apache
License, Version 2.0");
+ putLine(out, " * (the \"License\"); you may not use this file except
in compliance with");
+ putLine(out, " * the License. You may obtain a copy of the License
at");
+ putLine(out, " *");
+ putLine(out, " * http://www.apache.org/licenses/LICENSE-2.0");
+ putLine(out, " *");
+ putLine(out, " * Unless required by applicable law or agreed to in
writing, software");
+ putLine(out, " * distributed under the License is distributed on an
\"AS IS\" BASIS,");
+ putLine(out, " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
express or implied.");
+ putLine(out, " * See the License for the specific language governing
permissions and");
+ putLine(out, " * limitations under the License.");
+ putLine(out, " */");
+
+ // NOTE(review): the emitted source declares "package i18n;" and imports the findbugs
+ // NonNull annotation, while this generator lives in org.apache.phoenix.util.i18n.
+ // Confirm whether the generated file is expected to be adapted by hand.
+ putLine(out, "package i18n;");
+ putLine(out, "");
+ putLine(out, "import java.util.Locale;");
+ putLine(out, "import edu.umd.cs.findbugs.annotations.NonNull;");
+ putLine(out, "");
+ putLine(out, "/**");
+ putLine(out, " * Generated by " +
OracleUpperTableGeneratorTest.class.getCanonicalName());
+ putLine(out, " * <p>");
+ putLine(out, " * An instance of this enum codifies the difference
between executing a " +
+ "{@link #getSqlFormatString() particular PL/SQL");
+ putLine(out, " * expression} in Oracle and executing {@link
String#toUpperCase(Locale)} " +
+ "for a {@link #getLocale() particular locale}");
+ putLine(out, " * in Java. These differences (also called exceptions)
are expressed by " +
+ "the output of {@link #getUpperCaseExceptions()}");
+ putLine(out, " * and {@link #getUpperCaseExceptionMapping(char)}.");
+ putLine(out, " * <p>");
+ putLine(out, " * The tables are generated by testing a particular set
of characters " +
+ "that are known to contain exceptions and");
+ putLine(out, " * {@link #toUpperCase(String) may be used} to
compensate for exceptions " +
+ "found and generate output in Java that will be");
+ putLine(out, " * consistent with Oracle for the given (sql expression,
locale) pair " +
+ "over all tested values.");
+ putLine(out, " * <p>");
+ putLine(out, " * Characters tested:");
+ putLine(out, " * <ul>");
+ for (char c : charsToTest) {
+ putLine(out, " * <li>U+%1$s &#x%1$s</li>", hexCodePoint(c));
+ }
+ putLine(out, " * </ul>");
+ putLine(out, " *");
+ putLine(out, " * @see OracleUpper");
+ putLine(out, " */");
+ putLine(out, "public enum OracleUpperTable {");
+
+ // One enum constant per UpperExpr. The last constructor argument (the exception
+ // characters) is left open here and filled in by the IF statements below, which
+ // print a character only when Oracle's result differs from the Java result.
+ for (UpperExpr u : UpperExpr.values()) {
+ put(out, " %s(\"%s\", \"%s\", \"", u.name(), u.expr,
u.locale.getLanguage());
+
+ // Don't generate any exceptions for EO, it's a test value and
+ // I wanna use it as a baseline.
+ if (u != UpperExpr.ESPERANTO) {
+ for (char c : charsToTest) {
+ String template = "IF %1$s <> '%2$s' THEN
dbms_output.put(unistr('\\%3$s')); END IF;";
+ out.println(String.format(template, u.getSql(c),
u.getJava(c), hexCodePoint(c)));
+ }
+ }
+
+ putLine(out, "\"),");
+ }
+
+ putLine(out, " ;");
+ putLine(out, "");
+ putLine(out, " private final String sql;");
+ putLine(out, " private final Locale locale;");
+ putLine(out, " private final char[] exceptionChars;");
+ putLine(out, "");
+ putLine(out, " private OracleUpperTable(String sql, String lang, " +
+ "String exceptionChars) {");
+ putLine(out, " this.sql = sql;");
+ putLine(out, " this.locale = new Locale(lang);");
+ putLine(out, " this.exceptionChars =
exceptionChars.toCharArray();");
+ putLine(out, " }");
+ putLine(out, "");
+ putLine(out, " /**");
+ putLine(out, " * Return an array containing characters for which
Java's " +
+ "String.toUpperCase method is known to");
+ putLine(out, " * deviate from the result of Oracle evaluating
{@link #getSql(String) " +
+ "this expression}.");
+ putLine(out, " *");
+ putLine(out, " * @return an array containing all exceptional
characters.");
+ putLine(out, " */");
+ putLine(out, " final @NonNull char[] getUpperCaseExceptions() {");
+ putLine(out, " return exceptionChars;");
+ putLine(out, " }");
+ putLine(out, "");
+ putLine(out, " /**");
+ putLine(out, " * For a character, {@code exception}, contained in
the String " +
+ "returned from");
+ putLine(out, " * {@link #getUpperCaseExceptions()}, this method
returns the " +
+ "anticipated result of upper-casing");
+ putLine(out, " * the character in Oracle when evaluating {@link
#getSql(String) " +
+ "this expression}.");
+ putLine(out, " *");
+ putLine(out, " * @return the upper case of {@code exception},
according to what " +
+ "Oracle would do.");
+ putLine(out, " * @throws IllegalArgumentException");
+ putLine(out, " * if the character is not contained in
the String returned");
+ putLine(out, " * by {@link
#getUpperCaseExceptions()}.");
+ putLine(out, " */");
+ putLine(out, " final String getUpperCaseExceptionMapping(char
exception) {");
+
+ // Outer switch on the character, inner switch on the enum value. A "case" line is
+ // printed by Oracle only where its result differs from the Java result.
+ putLine(out, " switch (exception) {");
+ for (char c : charsToTest){
+ putLine(out, " case '%s':", "" + c);
+ putLine(out, " switch (this) {");
+ for (UpperExpr u : UpperExpr.values()) {
+ if (u == UpperExpr.ESPERANTO) {
+ continue;
+ }
+ String template = "IF %1$s <> '%2$s' THEN
dbms_output.put_line(' " +
+ "case %3$s: return ' || '\"' || %1$s || '\"; //
%2$s'); END IF;";
+ out.println(String.format(template,
+ u.getSql(c),
+ u.getJava(c),
+ u.name()));
+ }
+ putLine(out, " default: // fall out");
+ putLine(out, " }");
+ putLine(out, " break;");
+ }
+ putLine(out, " }");
+
+ putLine(out, " throw new IllegalArgumentException(");
+ putLine(out, " \"No upper case mapping for char=\" +
exception");
+ putLine(out, " + \" and this=\" + this);");
+ putLine(out, " }");
+ putLine(out, "");
+
+ putLine(out, " public final Locale getLocale() {");
+ putLine(out, " return locale;");
+ putLine(out, " }");
+ putLine(out, "");
+
+ putLine(out, " public String getSqlFormatString() {");
+ putLine(out, " return sql;");
+ putLine(out, " }");
+ putLine(out, "");
+
+ putLine(out, " public String getSql(String expr) {");
+ putLine(out, " return String.format(sql, expr);");
+ putLine(out, " }");
+ putLine(out, "");
+
+ putLine(out, " public String toUpperCase(String value) {");
+ putLine(out, " return OracleUpper.toUpperCase(this, value);");
+ putLine(out, " }");
+ putLine(out, "");
+
+ putLine(out, " public static final OracleUpperTable
forLinguisticSort(String sort) {");
+ putLine(out, " return Enum.valueOf(OracleUpperTable.class,
sort);");
+ putLine(out, " }");
+ putLine(out, "}");
+
+ // Close the anonymous PL/SQL block.
+ out.println("END;");
+ }
+
+ /** Escape single quotes by doubling them up (i.e. two single quotes in a
row). */
+ private static String sqlEscape(String str) {
+ //return TextUtil.replaceChar(str, '\'', "''");
+ return str.replace("'", "''");
+ }
+
+ /** Return four hex digits of the character's codepoint. */
+ private static String hexCodePoint(char c) {
+ String cp = Integer.toHexString(c);
+ while (cp.length() < 4) {
+ cp = "0" + cp;
+ }
+ return cp;
+ }
+
+ /** Send to standard output a dbms_output.put_line call that will emit the
result of
+ * {@link String#format(String, Object...) formatting} {@code str} with
{@code args}.
+ *
+ * @param str a format string
+ * @param args optional format arguments.
+ */
+ private static void put(PrintWriter out, String str, String... args) {
+ out.println("dbms_output.put('" + format(str, args) + "');");
+ }
+
+ /** Send to standard output a dbms_output.put call that will emit the
result of
+ * {@link #format(String, String...) formatting} {@code str} with {@code
args}.
+ *
+ * @param str a format string
+ * @param args optional format arguments.
+ */
+ private static void putLine(PrintWriter out, String str, String... args) {
+ out.println("dbms_output.put_line('" + format(str, args) + "');");
+ }
+
+ /**
+ * Both {@code str} and {@code args} will be {@link #sqlEscape(String)
sql escaped},
+ * and then {@code str} will be {@link String#format(String, Object...)
formatted}
+ * using {@code args}.
+ */
+ private static String format(String str, String... args) {
+ str = sqlEscape(str);
+ if (args != null && args.length > 0) {
+ for (int i = 0; i < args.length; i++) {
+ args[i] = sqlEscape(args[i]);
+ }
+ str = String.format(str, (Object[])args);
+ }
+ return str;
+ }
+
+ public static void main(String[] args) {
+ generateUpperCaseExceptions(new PrintWriter(System.out));
+ }
+
+ public void testGenerateUpperCaseExceptions() {
+ // Don't bother logging it, just see if there's an exception
+ generateUpperCaseExceptions(new PrintWriter(new StringWriter()));
+ }
+}
diff --git a/pom.xml b/pom.xml
index fa419ae31f..89548d9da5 100644
--- a/pom.xml
+++ b/pom.xml
@@ -124,7 +124,7 @@
<tephra.version>0.16.1</tephra.version>
<omid.version>1.0.2</omid.version>
<stream.version>2.9.5</stream.version>
- <i18n-util.version>1.0.4</i18n-util.version>
+ <icu4j.version>72.1</icu4j.version>
<guice.version>4.0</guice.version>
<zookeeper.version>3.4.14</zookeeper.version>
<curator.version>4.0.0</curator.version>
@@ -1443,9 +1443,14 @@
<version>${stream.version}</version>
</dependency>
<dependency>
- <groupId>com.salesforce.i18n</groupId>
- <artifactId>i18n-util</artifactId>
- <version>${i18n-util.version}</version>
+ <groupId>com.ibm.icu</groupId>
+ <artifactId>icu4j</artifactId>
+ <version>${icu4j.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.ibm.icu</groupId>
+ <artifactId>icu4j-localespi</artifactId>
+ <version>${icu4j.version}</version>
</dependency>
<dependency>
<groupId>com.lmax</groupId>