Very high level: UTF-16 is not expected to be a popular encoding for text outside the JDK. Everyone is supposed to be migrating to UTF-8 from ISO-8859-1 and other legacy encodings.
The fact that people (like you and me) are writing specialized encoders/decoders outside of the "real" charset implementations for better performance suggests that the nio charset API could be rethought.

On Sun, Jul 2, 2017 at 12:22 PM, John Platts <john_pla...@hotmail.com> wrote:

> I was looking at the OpenJDK 9 code, and I noticed that optimizations for
> encoding to and decoding from UTF-16 text could be added to the
> java.lang.StringCoding class.
>
> Here is how the optimized UTF-16 encoding and decoding could be
> implemented in java.lang.StringCoding:
>
>     private static void byteSwapUTF16(byte[] arr, int start) {
>         for (int i = start; i < arr.length; i += 2) {
>             byte b1 = arr[i];
>             byte b2 = arr[i + 1];
>             arr[i] = b2;
>             arr[i + 1] = b1;
>         }
>     }
>
>     static byte[] encodeUTF16BE(byte coder, byte[] val, boolean includeBOM) {
>         byte[] result;
>
>         if (coder == LATIN1) {
>             result = new byte[(val.length + (includeBOM ? 1 : 0)) << 1];
>             int resultStartOffset = includeBOM ? 2 : 0;
>
>             if (includeBOM) {
>                 result[0] = (byte) 0xFE;
>                 result[1] = (byte) 0xFF;
>             }
>
>             for (int i = 0; i < val.length; i++) {
>                 result[resultStartOffset + (i << 1) + 1] = val[i];
>             }
>         } else {
>             result = new byte[val.length + (includeBOM ? 2 : 0)];
>             int resultStartOffset = includeBOM ? 2 : 0;
>
>             if (includeBOM) {
>                 result[0] = (byte) 0xFE;
>                 result[1] = (byte) 0xFF;
>             }
>
>             System.arraycopy(val, 0, result, resultStartOffset, val.length);
>
>             if (StringUTF16.HI_BYTE_SHIFT == 0) {
>                 // val is encoded using little-endian UTF-16.
>                 // Convert to big-endian UTF-16 by byte swapping.
>                 byteSwapUTF16(result, resultStartOffset);
>             }
>
>             for (int i = resultStartOffset; i < result.length; i += 2) {
>                 int b1 = Byte.toUnsignedInt(result[i]);
>                 int b3 = result.length - i >= 4
>                     ? Byte.toUnsignedInt(result[i + 2]) : -1;
>                 if (b1 >= 0xD8 && b1 <= 0xDF) {
>                     if (b1 <= 0xDB && b3 >= 0xDC && b3 <= 0xDF) {
>                         // UTF-16 surrogate pair encountered.
>                         // Advance i to the position of the low surrogate
>                         // and continue the loop past it.
>                         i += 2;
>                         continue;
>                     }
>
>                     // Unpaired surrogate encountered;
>                     // replace it with U+FFFD.
>                     result[i] = (byte) 0xFF;
>                     result[i + 1] = (byte) 0xFD;
>                 }
>             }
>         }
>
>         return result;
>     }
>
>     static byte[] encodeUTF16LE(byte coder, byte[] val) {
>         byte[] result;
>
>         if (coder == LATIN1) {
>             result = new byte[val.length << 1];
>
>             for (int i = 0; i < val.length; i++) {
>                 result[i << 1] = val[i];
>             }
>         } else {
>             result = val.clone();
>
>             if (StringUTF16.LO_BYTE_SHIFT == 0) {
>                 // val is encoded using big-endian UTF-16.
>                 // Convert result to little-endian UTF-16 by byte swapping.
>                 byteSwapUTF16(result, 0);
>             }
>
>             for (int i = 0; i < result.length; i += 2) {
>                 int b2 = Byte.toUnsignedInt(result[i + 1]);
>                 int b4 = result.length - i >= 4
>                     ? Byte.toUnsignedInt(result[i + 3]) : -1;
>                 if (b2 >= 0xD8 && b2 <= 0xDF) {
>                     if (b2 <= 0xDB && b4 >= 0xDC && b4 <= 0xDF) {
>                         // UTF-16 surrogate pair encountered.
>                         // Advance i to the position of the low surrogate
>                         // and continue the loop past it.
>                         i += 2;
>                         continue;
>                     }
>
>                     // Unpaired surrogate encountered;
>                     // replace it with U+FFFD.
>                     result[i] = (byte) 0xFD;
>                     result[i + 1] = (byte) 0xFF;
>                 }
>             }
>         }
>
>         return result;
>     }
>
>     static Result bomDetectDecodeUTF16(byte[] ba, int off, int len) {
>         boolean bigEndian = true;
>
>         if (len >= 2) {
>             int b1 = Byte.toUnsignedInt(ba[off]);
>             int b2 = Byte.toUnsignedInt(ba[off + 1]);
>             if (b1 == 0xFE && b2 == 0xFF) {
>                 // Big-endian BOM detected
>                 off += 2;
>                 len -= 2;
>             } else if (b1 == 0xFF && b2 == 0xFE) {
>                 // Little-endian BOM detected
>                 off += 2;
>                 len -= 2;
>                 bigEndian = false;
>             }
>         }
>
>         return decodeUTF16(ba, off, len, bigEndian);
>     }
>
>     static Result decodeUTF16(byte[] ba, int off, int len, boolean bigEndian) {
>         Result result = new Result();
>
>         if (len == 0) {
>             return result.with();
>         }
>
>         byte[] decodedArr;
>         if (COMPACT_STRINGS && (len & 1) == 0) {
>             // Check for non-Latin-1 characters.
>             boolean containsNonLatin1 = false;
>             for (int i = 0; i < len; i += 2) {
>                 if (ba[off + i + (bigEndian ? 0 : 1)] != 0) {
>                     containsNonLatin1 = true;
>                     break;
>                 }
>             }
>
>             // If the input only contains Latin-1 characters, copy the
>             // source characters to a Latin-1-encoded byte array and
>             // return the decoded text.
>             if (!containsNonLatin1) {
>                 decodedArr = new byte[len >> 1];
>
>                 for (int i = 0; i < decodedArr.length; i++) {
>                     decodedArr[i] = ba[off + (i << 1) + (bigEndian ? 1 : 0)];
>                 }
>
>                 return result.with(decodedArr, LATIN1);
>             }
>         }
>
>         decodedArr = new byte[len + (len & 1)];
>         System.arraycopy(ba, off, decodedArr, 0, len);
>
>         if (StringUTF16.HI_BYTE_SHIFT != (bigEndian ? 8 : 0)) {
>             // Input byte order does not match system byte order.
>             // Byte swap decodedArr so that it is in system byte order.
>             byteSwapUTF16(decodedArr, 0);
>         }
>
>         // decodedArr is now in system byte order.
>
>         if ((len & 1) != 0) {
>             // If len is odd, there is a malformed character at the end;
>             // replace the last character in decodedArr with U+FFFD.
>             StringUTF16.putChar(decodedArr, (decodedArr.length >> 1) - 1,
>                                 0xFFFD);
>
>             // Decrement len by 1 to make len even.
>             len--;
>         }
>
>         // len is now even.
>         // charLen is the number of UTF-16 code units in decodedArr.
>         int charLen = len >> 1;
>
>         // Replace reversed BOMs and unpaired surrogates with U+FFFD.
>         for (int i = 0; i < charLen; i++) {
>             char ch = StringUTF16.getChar(decodedArr, i);
>
>             if (charLen - i >= 2 &&
>                 Character.isSurrogatePair(ch,
>                         StringUTF16.getChar(decodedArr, i + 1))) {
>                 // Surrogate pair detected.
>                 // Advance i to the position of the low surrogate and
>                 // continue the loop.
>                 i++;
>                 continue;
>             }
>
>             if (ch == (char) 0xFFFE || Character.isSurrogate(ch)) {
>                 // Reversed BOM or unpaired surrogate encountered;
>                 // replace ch with U+FFFD.
>                 StringUTF16.putChar(decodedArr, i, (char) 0xFFFD);
>             }
>         }
>
>         // If compact strings are enabled, return a Latin-1-encoded
>         // result if the result does not contain any non-Latin-1
>         // characters.
>         if (COMPACT_STRINGS) {
>             byte[] compressedArr =
>                 StringUTF16.compress(decodedArr, 0, decodedArr.length);
>             if (compressedArr != null) {
>                 return result.with(compressedArr, LATIN1);
>             }
>         }
>
>         return result.with(decodedArr, UTF16);
>     }
>
>     private static class StringDecoderUTF_16 extends StringDecoder {
>         StringDecoderUTF_16(Charset cs, String rcn) {
>             super(cs, rcn);
>         }
>         Result decode(byte[] ba, int off, int len) {
>             return bomDetectDecodeUTF16(ba, off, len);
>         }
>     }
>
>     private static class StringDecoderUTF_16LE extends StringDecoder {
>         StringDecoderUTF_16LE(Charset cs, String rcn) {
>             super(cs, rcn);
>         }
>         Result decode(byte[] ba, int off, int len) {
>             return decodeUTF16(ba, off, len, false);
>         }
>     }
>
>     private static class StringDecoderUTF_16BE extends StringDecoder {
>         StringDecoderUTF_16BE(Charset cs, String rcn) {
>             super(cs, rcn);
>         }
>         Result decode(byte[] ba, int off, int len) {
>             return decodeUTF16(ba, off, len, true);
>         }
>     }
>
>     static Result decode(String charsetName, byte[] ba, int off, int len)
>         throws UnsupportedEncodingException
>     {
>         StringDecoder sd = deref(decoder);
>         String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
>         if ((sd == null) || !(csn.equals(sd.requestedCharsetName())
>                               || csn.equals(sd.charsetName()))) {
>             sd = null;
>             try {
>                 Charset cs = lookupCharset(csn);
>                 if (cs != null) {
>                     if (cs == UTF_8) {
>                         sd = new StringDecoderUTF8(cs, csn);
>                     } else if (cs == ISO_8859_1) {
>                         sd = new StringDecoder8859_1(cs, csn);
>                     } else if (cs == StandardCharsets.UTF_16) {
>                         sd = new StringDecoderUTF_16(cs, csn);
>                     } else if (cs == StandardCharsets.UTF_16LE) {
>                         sd = new StringDecoderUTF_16LE(cs, csn);
>                     } else if (cs == StandardCharsets.UTF_16BE) {
>                         sd = new StringDecoderUTF_16BE(cs, csn);
>                     } else {
>                         sd = new StringDecoder(cs, csn);
>                     }
>                 }
>             } catch (IllegalCharsetNameException x) {}
>             if (sd == null)
>                 throw new UnsupportedEncodingException(csn);
>             set(decoder, sd);
>         }
>         return sd.decode(ba, off, len);
>     }
>
>     static byte[] encode(Charset cs, byte coder, byte[] val) {
>         if (cs == UTF_8) {
>             return encodeUTF8(coder, val);
>         } else if (cs == ISO_8859_1) {
>             return encode8859_1(coder, val);
>         } else if (cs == US_ASCII) {
>             return encodeASCII(coder, val);
>         } else if (cs == StandardCharsets.UTF_16
>                    || cs == StandardCharsets.UTF_16BE) {
>             return encodeUTF16BE(coder, val, cs == StandardCharsets.UTF_16);
>         } else if (cs == StandardCharsets.UTF_16LE) {
>             return encodeUTF16LE(coder, val);
>         }
>         CharsetEncoder ce = cs.newEncoder();
>         // fastpath for ascii compatible
>         if (coder == LATIN1 && ((ce instanceof ArrayEncoder) &&
>                                 ((ArrayEncoder) ce).isASCIICompatible() &&
>                                 !hasNegatives(val, 0, val.length))) {
>             return Arrays.copyOf(val, val.length);
>         }
>         int len = val.length >> coder;  // assume LATIN1=0/UTF16=1
>         int en = scale(len, ce.maxBytesPerChar());
>         byte[] ba = new byte[en];
>         if (len == 0) {
>             return ba;
>         }
>         boolean isTrusted = System.getSecurityManager() == null ||
>                             cs.getClass().getClassLoader0() == null;
>         ce.onMalformedInput(CodingErrorAction.REPLACE)
>           .onUnmappableCharacter(CodingErrorAction.REPLACE)
>           .reset();
>         if (ce instanceof ArrayEncoder) {
>             if (!isTrusted) {
>                 val = Arrays.copyOf(val, val.length);
>             }
>             int blen = (coder == LATIN1)
>                 ? ((ArrayEncoder) ce).encodeFromLatin1(val, 0, len, ba)
>                 : ((ArrayEncoder) ce).encodeFromUTF16(val, 0, len, ba);
>             if (blen != -1) {
>                 return safeTrim(ba, blen, isTrusted);
>             }
>         }
>         char[] ca = (coder == LATIN1)
>             ? StringLatin1.toChars(val) : StringUTF16.toChars(val);
>         ByteBuffer bb = ByteBuffer.wrap(ba);
>         CharBuffer cb = CharBuffer.wrap(ca, 0, len);
>         try {
>             CoderResult cr = ce.encode(cb, bb, true);
>             if (!cr.isUnderflow())
>                 cr.throwException();
>             cr = ce.flush(bb);
>             if (!cr.isUnderflow())
>                 cr.throwException();
>         } catch (CharacterCodingException x) {
>             throw new Error(x);
>         }
>         return safeTrim(ba, bb.position(), isTrusted);
>     }
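
To make that point about the API concrete: below is roughly the ceremony the general nio charset path imposes on a caller today. This is a minimal sketch, not JDK-internal code; decodeViaNio and Utf16DecodeSketch are names I made up for illustration.

    import java.nio.ByteBuffer;
    import java.nio.CharBuffer;
    import java.nio.charset.CharacterCodingException;
    import java.nio.charset.CharsetDecoder;
    import java.nio.charset.CodingErrorAction;
    import java.nio.charset.StandardCharsets;

    class Utf16DecodeSketch {
        // The general-purpose route: build a CharsetDecoder, wrap the
        // input in a ByteBuffer, decode into a freshly allocated
        // CharBuffer, then copy once more into a String. A specialized
        // decoder like the one proposed above works on the byte[]
        // directly and skips the intermediate buffers.
        static String decodeViaNio(byte[] ba, int off, int len) {
            CharsetDecoder cd = StandardCharsets.UTF_16.newDecoder()
                    .onMalformedInput(CodingErrorAction.REPLACE)
                    .onUnmappableCharacter(CodingErrorAction.REPLACE);
            try {
                CharBuffer cb = cd.decode(ByteBuffer.wrap(ba, off, len));
                return cb.toString();
            } catch (CharacterCodingException x) {
                // Unreachable: both error actions are set to REPLACE.
                throw new AssertionError(x);
            }
        }
    }

As long as the public API looks like this, every charset that wants a byte[]-to-byte[] fast path ends up going through private hooks like ArrayEncoder instead.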