I was looking at the OpenJDK 9 code, and I noticed that fast paths for encoding strings to UTF-16 and decoding strings from UTF-16 could be added to the java.lang.StringCoding class.
Here is how the optimized UTF-16 decoding could be implemented in java.lang.StringCoding: private static void byteSwapUTF16(byte[] arr, int start) { for(int i = start; i < arr.length; i += 2) { byte b1 = arr[i]; byte b2 = arr[i + 1]; arr[i] = b2; arr[i + 1] = b1; } } static byte[] encodeUTF16BE(byte coder, byte[] val, boolean includeBOM) { byte[] result; if(coder == LATIN1) { result = new byte[(val.length + (includeBOM ? 1 : 0)) << 1]; int resultStartOffset = includeBOM ? 2 : 0; if(includeBOM) { result[0] = (byte)0xFE; result[1] = (byte)0xFF; } for(int i = 0; i < val.length; i++) { result[resultStartOffset + (i << 1) + 1] = val[i]; } } else { result = new byte[val.length + (includeBOM ? 2 : 0)]; int resultStartOffset = includeBOM ? 2 : 0; if(includeBOM) { result[0] = (byte)0xFE; result[1] = (byte)0xFF; } System.arraycopy(val, 0, result, resultStartOffset, val.length); if(StringUTF16.HI_BYTE_SHIFT == 0) { // val is encoded using little-endian UTF-16 // Convert to big-endian UTF-16 from little-endian UTF-16 byteSwapUTF16(result, resultStartOffset); } for(int i = resultStartOffset; i < result.length; i += 2) { int b1 = Byte.toUnsignedInt(result[i]); int b3 = result.length - i >= 4 ? 
Byte.toUnsignedInt(result[i + 2]) : -1; if(b1 >= 0xD8 && b1 <= 0xDF) { if(b1 <= 0xDB && b3 >= 0xDC && b3 <= 0xDF) { // UTF-16 surrogate pair encountered // Advance i to the position of the low surrogate i += 2; // Continue the loop past the low surrogate continue; } // Unpaired surrogate character encountered // Replace unpaired surrogate character with U+FFFD result[i] = (byte)0xFF; result[i + 1] = (byte)0xFD; } } } return result; } static byte[] encodeUTF16LE(byte coder, byte[] val) { byte[] result; if(coder == LATIN1) { result = new byte[val.length << 1]; for(int i = 0; i < val.length; i++) { result[i << 1] = val[i]; } } else { result = val.clone(); if(StringUTF16.LO_BYTE_SHIFT == 0) { // val is encoded using big-endian UTF-16 // Convert result to little-endian UTF-16 from big-endian UTF-16 by byte swapping byteSwapUTF16(result, 0); } for(int i = 0; i < result.length; i += 2) { int b2 = Byte.toUnsignedInt(result[i + 1]); int b4 = result.length - i >= 4 ? Byte.toUnsignedInt(result[i + 3]) : -1; if(b2 >= 0xD8 && b2 <= 0xDF) { if(b2 <= 0xDB && b4 >= 0xDC && b4 <= 0xDF) { // UTF-16 surrogate pair encountered // Advance i to the position of the low surrogate i += 2; // Continue the loop past the low surrogate continue; } // Unpaired surrogate character encountered // Replace unpaired surrogate character with U+FFFD result[i] = (byte)0xFD; result[i + 1] = (byte)0xFF; } } } return result; } static Result bomDetectDecodeUTF16(byte[] ba, int off, int len) { boolean bigEndian = true; if(len >= 2) { int b1 = Byte.toUnsignedInt(ba[off]); int b2 = Byte.toUnsignedInt(ba[off + 1]); if(b1 == 0xFE && b2 == 0xFF) { // Big-endian BOM detected off += 2; len -= 2; } else if(b1 == 0xFF && b2 == 0xFE) { // Little-endian BOM detected off += 2; len -= 2; bigEndian = false; } } return decodeUTF16(ba, off, len, bigEndian); } static Result decodeUTF16(byte[] ba, int off, int len, boolean bigEndian) { Result result = new Result(); if(len == 0) { return result.with(); } byte[] decodedArr; 
if(COMPACT_STRINGS && (len & 1) == 0) { // Check for non-Latin1 characters boolean containsNonLatin1 = false; for(int i = 0; i < len; i += 2) { if(ba[off + i + (bigEndian ? 0 : 1)] != 0) { containsNonLatin1 = true; break; } } // If the input only contains Latin1 characters, copy the source characters // to a Latin1-encoded byte array, and return the decoded text. if(!containsNonLatin1) { decodedArr = new byte[len >> 1]; for(int i = 0; i < decodedArr.length; i++) { decodedArr[i] = ba[off + (i << 1) + (bigEndian ? 1 : 0)]; } return result.with(decodedArr, LATIN1); } } decodedArr = new byte[len + (len & 1)]; System.arraycopy(ba, off, decodedArr, 0, len); if(StringUTF16.HI_BYTE_SHIFT != (bigEndian ? 8 : 0)) { // Input byte order does not match system byte order // Byte swap decodedArr so that decodedArr is in system byte order byteSwapUTF16(decodedArr, 0); } // decodedArr is now in system byte order if((len & 1) != 0) { // If len is odd, then there is a malformed character at the end. // Replace the last character in decodedArr with U+FFFD if this is the case. StringUTF16.putChar(decodedArr, (decodedArr.length >> 1) - 1, 0xFFFD); // Decrement len by 1 to make len even. len--; } // len is now even // charLen is equal to the number of UTF-16 characters in decodedArr int charLen = len >> 1; // replace the reversed BOM and unpaired surrogates with U+FFFD for(int i = 0; i < charLen; i++) { char ch = StringUTF16.getChar(decodedArr, i); if(charLen - i >= 2 && Character.isSurrogatePair(ch, StringUTF16.getChar(decodedArr, i + 1)) { // Surrogate pair detected // Increment i to the position of the low surrogate i++; // Continue the loop continue; } if(ch == (char)0xFFFE || Character.isSurrogate(ch)) { // Reversed BOM or unpaired surrogate encountered // Replace ch with 0xFFFD StringUTF16.putChar(decodedArr, i, (char)0xFFFD); } } // If compact strings are enabled, return a Latin1-encoded result if the result // does not contain any non-Latin-1 characters. 
if(COMPACT_STRINGS) { byte[] compressedArr = StringUTF16.compress(decodedArr, 0, decodedArr.len); if(compressedArr != null) { return result.with(compressedArr, LATIN1); } } return result.with(decodedArr, UTF16); } private static class StringDecoderUTF_16 extends StringDecoder { StringDecoderUTF_16(Charset cs, String rcn) { super(cs, rcn); } Result decode(byte[] ba, int off, int len) { return bomDetectDecodeUTF16(ba, off, len); } } private static class StringDecoderUTF_16LE extends StringDecoder { StringDecoderUTF_16(Charset cs, String rcn) { super(cs, rcn); } Result decode(byte[] ba, int off, int len) { return decodeUTF16(ba, off, len, false); } } private static class StringDecoderUTF_16BE extends StringDecoder { StringDecoderUTF_16(Charset cs, String rcn) { super(cs, rcn); } Result decode(byte[] ba, int off, int len) { return decodeUTF16(ba, off, len, true); } } static Result decode(String charsetName, byte[] ba, int off, int len) throws UnsupportedEncodingException { StringDecoder sd = deref(decoder); String csn = (charsetName == null) ? 
"ISO-8859-1" : charsetName; if ((sd == null) || !(csn.equals(sd.requestedCharsetName()) || csn.equals(sd.charsetName()))) { sd = null; try { Charset cs = lookupCharset(csn); if (cs != null) { if (cs == UTF_8) { sd = new StringDecoderUTF8(cs, csn); } else if (cs == ISO_8859_1) { sd = new StringDecoder8859_1(cs, csn); } else if(cs == StandardCharsets.UTF_16) { sd = new StringDecoderUTF_16(cs, csn); } else if(cs == StandardCharsets.UTF_16LE) { sd = new StringDecoderUTF_16LE(cs, csn); } else if(cs == StandardCharsets.UTF_16BE) { sd = new StringDecoderUTF_16BE(cs, csn); } else { sd = new StringDecoder(cs, csn); } } } catch (IllegalCharsetNameException x) {} if (sd == null) throw new UnsupportedEncodingException(csn); set(decoder, sd); } return sd.decode(ba, off, len); } } static byte[] encode(Charset cs, byte coder, byte[] val) { if (cs == UTF_8) { return encodeUTF8(coder, val); } else if (cs == ISO_8859_1) { return encode8859_1(coder, val); } else if (cs == US_ASCII) { return encodeASCII(coder, val); } else if (cs == StandardCharsets.UTF_16 || cs == StandardCharsets.UTF_16BE){ return encodeUTF16BE(coder, val, cs == StandardCharsets.UTF_16); } else if (cs == StandardCharsets.UTF_16LE) { return encodeUTF16LE(coder, val); } CharsetEncoder ce = cs.newEncoder(); // fastpath for ascii compatible if (coder == LATIN1 && (((ce instanceof ArrayEncoder) && ((ArrayEncoder)ce).isASCIICompatible() && !hasNegatives(val, 0, val.length)))) { return Arrays.copyOf(val, val.length); } int len = val.length >> coder; // assume LATIN1=0/UTF16=1; int en = scale(len, ce.maxBytesPerChar()); byte[] ba = new byte[en]; if (len == 0) { return ba; } boolean isTrusted = System.getSecurityManager() == null || cs.getClass().getClassLoader0() == null; ce.onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE) .reset(); if (ce instanceof ArrayEncoder) { if (!isTrusted) { val = Arrays.copyOf(val, val.length); } int blen = (coder == LATIN1 ) ? 
((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); if (blen != -1) { return safeTrim(ba, blen, isTrusted); } } char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) : StringUTF16.toChars(val); ByteBuffer bb = ByteBuffer.wrap(ba); CharBuffer cb = CharBuffer.wrap(ca, 0, len); try { CoderResult cr = ce.encode(cb, bb, true); if (!cr.isUnderflow()) cr.throwException(); cr = ce.flush(bb); if (!cr.isUnderflow()) cr.throwException(); } catch (CharacterCodingException x) { throw new Error(x); } return safeTrim(ba, bb.position(), isTrusted); }