Repository: flume Updated Branches: refs/heads/trunk f3b6ceb6a -> 344e0acca
FLUME-2215. ResettableFileInputStream can't support ucs-4 character (Alexandre Dutra via Hari) Project: http://git-wip-us.apache.org/repos/asf/flume/repo Commit: http://git-wip-us.apache.org/repos/asf/flume/commit/344e0acc Tree: http://git-wip-us.apache.org/repos/asf/flume/tree/344e0acc Diff: http://git-wip-us.apache.org/repos/asf/flume/diff/344e0acc Branch: refs/heads/trunk Commit: 344e0accae5675fd3d14b8414531528607865aae Parents: f3b6ceb Author: Hari Shreedharan <[email protected]> Authored: Wed May 27 09:58:23 2015 -0700 Committer: Hari Shreedharan <[email protected]> Committed: Wed May 27 09:58:23 2015 -0700 ---------------------------------------------------------------------- .../ResettableFileInputStream.java | 151 ++++++++- .../TestResettableFileInputStream.java | 326 ++++++++++++++----- 2 files changed, 375 insertions(+), 102 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/flume/blob/344e0acc/flume-ng-core/src/main/java/org/apache/flume/serialization/ResettableFileInputStream.java ---------------------------------------------------------------------- diff --git a/flume-ng-core/src/main/java/org/apache/flume/serialization/ResettableFileInputStream.java b/flume-ng-core/src/main/java/org/apache/flume/serialization/ResettableFileInputStream.java index 587ab29..622c09f 100644 --- a/flume-ng-core/src/main/java/org/apache/flume/serialization/ResettableFileInputStream.java +++ b/flume-ng-core/src/main/java/org/apache/flume/serialization/ResettableFileInputStream.java @@ -37,13 +37,49 @@ import java.nio.charset.CoderResult; import java.nio.charset.CodingErrorAction; /** - * <p/>This class makes the following assumptions: + * <p>This class makes the following assumptions:</p> * <ol> * <li>The underlying file is not changing while it is being read</li> * </ol> * - * <p/>The ability to {@link #reset()} is dependent on the underlying {@link - * PositionTracker} instance's durability semantics. 
+ * <p>The ability to {@link #reset()} is dependent on the underlying {@link + * PositionTracker} instance's durability semantics.</p> + * + * <p><strong>A note on surrogate pairs:</strong></p> + * + * <p>The logic for decoding surrogate pairs is as follows: + * If no character has been decoded by a "normal" pass, and the buffer still has remaining bytes, + * then an attempt is made to read 2 characters in one pass. + * If it succeeds, then the first char (high surrogate) is returned; + * the second char (low surrogate) is recorded internally, + * and is returned at the next call to {@link #readChar()}. + * If it fails, then it is assumed that EOF has been reached.</p> + * + * <p>Impacts on position, mark and reset: when a surrogate pair is decoded, the position + * is incremented by the number of bytes taken to decode the <em>entire</em> pair (usually, 4). + * This is the most reasonable choice since it would not be advisable + * to reset a stream to a position pointing to the second char in a pair of surrogates: + * such a dangling surrogate would not be properly decoded without its counterpart.</p> + * + * <p>Thus the behaviour of mark and reset is as follows:</p> + * + * <ol> + * <li>If {@link #mark()} is called after a high surrogate has been returned by {@link #readChar()}, + * the marked position will be that of the character <em>following</em> the low surrogate, + * <em>not</em> that of the low surrogate itself.</li> + * <li>If {@link #reset()} is called after a high surrogate has been returned by {@link #readChar()}, + * the low surrogate is always returned by the next call to {@link #readChar()}, + * <em>before</em> the stream is actually reset to the last marked position.</li> + * </ol> + * + * <p>This ensures that no dangling high surrogate could ever be read as long as + * the same instance is used to read the whole pair. 
<strong>However, if {@link #reset()} + * is called after a high surrogate has been returned by {@link #readChar()}, + * and a new instance of ResettableFileInputStream is used to resume reading, + * then the low surrogate char will be lost, + * resulting in a corrupted sequence of characters (dangling high surrogate).</strong> + * This situation is hopefully extremely unlikely to happen in real life. + * </p> */ @InterfaceAudience.Private @InterfaceStability.Evolving @@ -54,6 +90,14 @@ public class ResettableFileInputStream extends ResettableInputStream public static final int DEFAULT_BUF_SIZE = 16384; + /** + * The minimum acceptable buffer size to store bytes read + * from the underlying file. A minimum size of 8 ensures that the + * buffer always has enough space to contain multi-byte characters, + * including special sequences such as surrogate pairs, Byte Order Marks, etc. + */ + public static final int MIN_BUF_SIZE = 8; + private final File file; private final PositionTracker tracker; private final FileInputStream in; @@ -67,6 +111,21 @@ public class ResettableFileInputStream extends ResettableInputStream private long syncPosition; private int maxCharWidth; + + /** + * Whether this instance holds a low surrogate character. + */ + private boolean hasLowSurrogate = false; + + /** + * A low surrogate character read from a surrogate pair. + * When a surrogate pair is found, the high (first) surrogate + * is returned upon a call to {@link #readChar()}, + * while the low (second) surrogate remains stored in memory, + * to be returned at the next call to {@link #readChar()}. 
+ */ + private char lowSurrogate; + /** * * @param file @@ -75,7 +134,8 @@ public class ResettableFileInputStream extends ResettableInputStream * @param tracker * PositionTracker implementation to make offset position durable * - * @throws FileNotFoundException + * @throws FileNotFoundException If the file to read does not exist + * @throws IOException If the position reported by the tracker cannot be sought */ public ResettableFileInputStream(File file, PositionTracker tracker) throws IOException { @@ -91,12 +151,19 @@ public class ResettableFileInputStream extends ResettableInputStream * PositionTracker implementation to make offset position durable * * @param bufSize - * Size of the underlying buffer used for input + * Size of the underlying buffer used for input. If less than {@link #MIN_BUF_SIZE}, + * a buffer of length {@link #MIN_BUF_SIZE} will be created instead. * * @param charset * Character set used for decoding text, as necessary * - * @throws FileNotFoundException + * @param decodeErrorPolicy + * A {@link DecodeErrorPolicy} instance to determine how + * the decoder should behave in case of malformed input and/or + * unmappable characters. 
+ * + * @throws FileNotFoundException If the file to read does not exist + * @throws IOException If the position reported by the tracker cannot be sought */ public ResettableFileInputStream(File file, PositionTracker tracker, int bufSize, Charset charset, DecodeErrorPolicy decodeErrorPolicy) @@ -105,16 +172,27 @@ public class ResettableFileInputStream extends ResettableInputStream this.tracker = tracker; this.in = new FileInputStream(file); this.chan = in.getChannel(); - this.buf = ByteBuffer.allocateDirect(bufSize); + this.buf = ByteBuffer.allocateDirect(Math.max(bufSize, MIN_BUF_SIZE)); buf.flip(); this.byteBuf = new byte[1]; // single byte - this.charBuf = CharBuffer.allocate(1); // single char + this.charBuf = CharBuffer.allocate(2); // two chars for surrogate pairs charBuf.flip(); this.fileSize = file.length(); this.decoder = charset.newDecoder(); this.position = 0; this.syncPosition = 0; - this.maxCharWidth = (int)Math.ceil(charset.newEncoder().maxBytesPerChar()); + if(charset.name().startsWith("UTF-8")) { + // some JDKs wrongly report 3 bytes max + this.maxCharWidth = 4; + } else if(charset.name().startsWith("UTF-16")) { + // UTF_16BE and UTF_16LE wrongly report 2 bytes max + this.maxCharWidth = 4; + } else if(charset.name().startsWith("UTF-32")) { + // UTF_32BE and UTF_32LE wrongly report 4 bytes max + this.maxCharWidth = 8; + } else { + this.maxCharWidth = (int) Math.ceil(charset.newEncoder().maxBytesPerChar()); + } CodingErrorAction errorAction; switch (decodeErrorPolicy) { @@ -173,6 +251,14 @@ public class ResettableFileInputStream extends ResettableInputStream @Override public synchronized int readChar() throws IOException { + + // Check whether we are in the middle of a surrogate pair, + // in which case, return the last (low surrogate) char of the pair. + if(hasLowSurrogate) { + hasLowSurrogate = false; + return lowSurrogate; + } + // The decoder can have issues with multi-byte characters. 
// This check ensures that there are at least maxCharWidth bytes in the buffer // before reaching EOF. @@ -184,6 +270,7 @@ public class ResettableFileInputStream extends ResettableInputStream int start = buf.position(); charBuf.clear(); + charBuf.limit(1); boolean isEndOfInput = false; if (position >= fileSize) { @@ -198,20 +285,50 @@ public class ResettableFileInputStream extends ResettableInputStream int delta = buf.position() - start; charBuf.flip(); + + // Found a single char if (charBuf.hasRemaining()) { char c = charBuf.get(); - // don't increment the persisted location if we are in between a - // surrogate pair, otherwise we may never recover if we seek() to this - // location! - incrPosition(delta, !Character.isHighSurrogate(c)); + incrPosition(delta, true); return c; + } - // there may be a partial character in the decoder buffer - } else { - incrPosition(delta, false); - return -1; + // Found nothing, but the byte buffer has not been entirely consumed. + // This situation denotes the presence of a surrogate pair + // that can only be decoded if we have a 2-char buffer. + if(buf.hasRemaining()) { + charBuf.clear(); + // increase the limit to 2 + charBuf.limit(2); + // decode 2 chars in one pass + res = decoder.decode(buf, charBuf, isEndOfInput); + if (res.isMalformed() || res.isUnmappable()) { + res.throwException(); + } + charBuf.flip(); + // Check if we successfully decoded 2 chars + if (charBuf.remaining() == 2) { + char highSurrogate = charBuf.get(); + // save second (low surrogate) char for later consumption + lowSurrogate = charBuf.get(); + // Check if we really have a surrogate pair + if( ! Character.isHighSurrogate(highSurrogate) || ! Character.isLowSurrogate(lowSurrogate)) { + // This should only happen in case of bad sequences (dangling surrogate, etc.) 
+ logger.warn("Decoded a pair of chars, but it does not seem to be a surrogate pair: {} {}", (int)highSurrogate, (int)lowSurrogate); + } + hasLowSurrogate = true; + // consider the pair as a single unit and increment position normally + delta = buf.position() - start; + incrPosition(delta, true); + // return the first (high surrogate) char of the pair + return highSurrogate; + } } + // end of file + incrPosition(delta, false); + return -1; + } private void refillBuf() throws IOException { http://git-wip-us.apache.org/repos/asf/flume/blob/344e0acc/flume-ng-core/src/test/java/org/apache/flume/serialization/TestResettableFileInputStream.java ---------------------------------------------------------------------- diff --git a/flume-ng-core/src/test/java/org/apache/flume/serialization/TestResettableFileInputStream.java b/flume-ng-core/src/test/java/org/apache/flume/serialization/TestResettableFileInputStream.java index d1240fb..2c559db 100644 --- a/flume-ng-core/src/test/java/org/apache/flume/serialization/TestResettableFileInputStream.java +++ b/flume-ng-core/src/test/java/org/apache/flume/serialization/TestResettableFileInputStream.java @@ -18,7 +18,6 @@ package org.apache.flume.serialization; import com.google.common.base.Charsets; -import com.google.common.base.Strings; import com.google.common.collect.Lists; import com.google.common.io.Files; import junit.framework.Assert; @@ -29,7 +28,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import static org.junit.Assert.*; -import static org.junit.Assert.assertEquals; import java.io.BufferedOutputStream; import java.io.ByteArrayOutputStream; @@ -89,25 +87,75 @@ public class TestResettableFileInputStream { in.close(); } + /** * Ensure that we can process lines that contain multi byte characters in weird places * such as at the end of a buffer. 
* @throws IOException */ @Test - public void testWideCharRead() throws IOException { - String output = wideCharFileInit(file, Charsets.UTF_8); - - PositionTracker tracker = new DurablePositionTracker(meta, file.getPath()); - ResettableInputStream in = new ResettableFileInputStream(file, tracker); + public void testMultiByteCharRead() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + out.write("1234567".getBytes(Charsets.UTF_8)); + // write a multi byte char encompassing buffer boundaries + generateUtf83ByteSequence(out); + // buffer now contains 8 chars and 10 bytes total + Files.write(out.toByteArray(), file); + ResettableInputStream in = initInputStream(8, Charsets.UTF_8, DecodeErrorPolicy.FAIL); + String result = readLine(in, 8); + assertEquals("1234567\u0A93\n", result); + } - String result = readLine(in, output.length()); - assertEquals(output, result); + /** + * Ensure that we can process UTF-8 lines that contain surrogate pairs + * even if they appear astride buffer boundaries. + * @throws IOException + */ + @Test + public void testUtf8SurrogatePairRead() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + out.write("1234567".getBytes(Charsets.UTF_8)); + generateUtf8SurrogatePairSequence(out); + // buffer now contains 9 chars (7 "normal" and 2 surrogates) and 11 bytes total + // surrogate pair will encompass buffer boundaries + Files.write(out.toByteArray(), file); + ResettableInputStream in = initInputStream(8, Charsets.UTF_8, DecodeErrorPolicy.FAIL); + String result = readLine(in, 9); + assertEquals("1234567\uD83D\uDE18\n", result); + } - String afterEOF = readLine(in, output.length()); - assertNull(afterEOF); + /** + * Ensure that we can process UTF-16 lines that contain surrogate pairs, even + * preceded by a Byte Order Mark (BOM). 
+ * @throws IOException + */ + @Test + public void testUtf16BOMAndSurrogatePairRead() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + generateUtf16SurrogatePairSequence(out); + // buffer now contains 1 BOM and 2 chars (1 surrogate pair) and 6 bytes total (including 2-byte BOM) + Files.write(out.toByteArray(), file); + ResettableInputStream in = initInputStream(8, Charsets.UTF_16, DecodeErrorPolicy.FAIL); + String result = readLine(in, 2); + assertEquals("\uD83D\uDE18\n", result); + } - in.close(); + /** + * Ensure that we can process Shift_JIS lines that contain multi byte Japanese chars + * even if they appear astride buffer boundaries. + * @throws IOException + */ + @Test + public void testShiftJisSurrogateCharRead() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + out.write("1234567".getBytes(Charset.forName("Shift_JIS"))); + // write a multi byte char encompassing buffer boundaries + generateShiftJis2ByteSequence(out); + // buffer now contains 8 chars and 9 bytes total + Files.write(out.toByteArray(), file); + ResettableInputStream in = initInputStream(8, Charset.forName("Shift_JIS"), DecodeErrorPolicy.FAIL); + String result = readLine(in, 8); + assertEquals("1234567\u4E9C\n", result); } @Test(expected = MalformedInputException.class) @@ -178,60 +226,6 @@ public class TestResettableFileInputStream { assertEquals("Invalid: (X)\n".replaceAll("X", "\ufffd"), sb.toString()); } - private ResettableInputStream initUtf8DecodeTest(DecodeErrorPolicy policy) - throws IOException { - writeBigBadUtf8Sequence(file); - return initInputStream(policy); - } - - private ResettableInputStream initInputStream(DecodeErrorPolicy policy) - throws IOException { - PositionTracker tracker = new DurablePositionTracker(meta, file.getPath()); - ResettableInputStream in = new ResettableFileInputStream(file, tracker, - 2048, Charsets.UTF_8, policy); - return in; - } - - private void writeBigBadUtf8Sequence(File file) throws 
IOException { - ByteArrayOutputStream out = new ByteArrayOutputStream(); - generateUtf8Latin1Sequence(out); - generateUtf8OverlyLongSequence(out); - generateUtf8NonUnicodeSequence(out); - Files.write(out.toByteArray(), file); - } - - private void generateUtf8OverlyLongSequence(OutputStream out) - throws IOException { - out.write("Long: (".getBytes(Charsets.UTF_8)); - // Overly-long slash character should not be accepted. - out.write(new byte[] { (byte)0xe0, (byte)0x80, (byte)0xaf }); - out.write(")\n".getBytes(Charsets.UTF_8)); - } - - private void generateUtf8NonUnicodeSequence(OutputStream out) - throws IOException { - out.write("NonUnicode: (".getBytes(Charsets.UTF_8)); - // This is a valid 5-octet sequence but is not Unicode - out.write(new byte[] { (byte)0xf8, (byte)0xa1, (byte)0xa1, (byte)0xa1, - (byte)0xa1 } ); - out.write(")\n".getBytes(Charsets.UTF_8)); - } - - private void generateUtf8Latin1Sequence(OutputStream out) throws IOException { - out.write("Latin1: (".getBytes(Charsets.UTF_8)); - // This is "e" with an accent in Latin-1 - out.write(new byte[] { (byte)0xe9 } ); - out.write(")\n".getBytes(Charsets.UTF_8)); - } - - private void generateLatin1InvalidSequence(OutputStream out) - throws IOException { - out.write("Invalid: (".getBytes(Charsets.UTF_8)); - // Not a valid character in Latin 1. - out.write(new byte[] { (byte)0x81 } ); - out.write(")\n".getBytes(Charsets.UTF_8)); - } - /** * Ensure a reset() brings us back to the default mark (beginning of file) * @throws IOException @@ -291,6 +285,55 @@ public class TestResettableFileInputStream { in.close(); } + /** + * Ensure that surrogate pairs work well with mark/reset. 
+ * @throws IOException + */ + @Test + public void testMarkResetWithSurrogatePairs() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + out.write("foo".getBytes(Charsets.UTF_8)); + generateUtf8SurrogatePairSequence(out); + out.write("bar".getBytes(Charsets.UTF_8)); + Files.write(out.toByteArray(), file); + PositionTracker tracker = new DurablePositionTracker(meta, file.getPath()); + ResettableInputStream in = new ResettableFileInputStream(file, tracker); + Assert.assertEquals('f', in.readChar()); + Assert.assertEquals('o', in.readChar()); + in.mark(); + Assert.assertEquals('o', in.readChar()); + // read high surrogate + Assert.assertEquals('\ud83d', in.readChar()); + // call reset in the middle of a surrogate pair + in.reset(); + // will read low surrogate *before* reverting back to mark, to ensure + // surrogate pair is properly read + Assert.assertEquals('\ude18', in.readChar()); + // now back to marked position + Assert.assertEquals('o', in.readChar()); + // read high surrogate again + Assert.assertEquals('\ud83d', in.readChar()); + // call mark in the middle of a surrogate pair: + // will mark the position *after* the pair, *not* low surrogate's position + in.mark(); + // will reset to the position *after* the pair + in.reset(); + // read low surrogate normally despite reset being called + // so that the pair is entirely read + Assert.assertEquals('\ude18', in.readChar()); + Assert.assertEquals('b', in.readChar()); + Assert.assertEquals('a', in.readChar()); + // will reset to the position *after* the pair + in.reset(); + Assert.assertEquals('b', in.readChar()); + Assert.assertEquals('a', in.readChar()); + Assert.assertEquals('r', in.readChar()); + Assert.assertEquals(-1, in.readChar()); + in.close(); + tracker.close(); // redundant + } + + @Test public void testResume() throws IOException { List<String> expected = multiLineFileInit(file, Charsets.UTF_8); @@ -322,6 +365,62 @@ public class TestResettableFileInputStream { 
Assert.assertEquals(result3, result3a); } + /** + * Ensure that surrogate pairs work well when resuming + * reading. Specifically, this test brings up special situations + * where a surrogate pair cannot be correctly decoded because + * the second character is lost. + * + * @throws IOException + */ + @Test + public void testResumeWithSurrogatePairs() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + out.write("foo".getBytes(Charsets.UTF_8)); + generateUtf8SurrogatePairSequence(out); + out.write("bar".getBytes(Charsets.UTF_8)); + Files.write(out.toByteArray(), file); + PositionTracker tracker = new DurablePositionTracker(meta, file.getPath()); + ResettableInputStream in = new ResettableFileInputStream(file, tracker); + Assert.assertEquals('f', in.readChar()); + Assert.assertEquals('o', in.readChar()); + in.mark(); + Assert.assertEquals('o', in.readChar()); + // read high surrogate + Assert.assertEquals('\ud83d', in.readChar()); + // call reset in the middle of a surrogate pair + in.reset(); + // close RIS - this will cause the low surrogate char + // stored in-memory to be lost + in.close(); + tracker.close(); // redundant + // create new Tracker & RIS + tracker = new DurablePositionTracker(meta, file.getPath()); + in = new ResettableFileInputStream(file, tracker); + // low surrogate char is now lost - resume from marked position + Assert.assertEquals('o', in.readChar()); + // read high surrogate again + Assert.assertEquals('\ud83d', in.readChar()); + // call mark in the middle of a surrogate pair: + // will mark the position *after* the pair, *not* low surrogate's position + in.mark(); + // close RIS - this will cause the low surrogate char + // stored in-memory to be lost + in.close(); + tracker.close(); // redundant + // create new Tracker & RIS + tracker = new DurablePositionTracker(meta, file.getPath()); + in = new ResettableFileInputStream(file, tracker); + // low surrogate char is now lost - resume from marked position + 
Assert.assertEquals('b', in.readChar()); + Assert.assertEquals('a', in.readChar()); + Assert.assertEquals('r', in.readChar()); + Assert.assertEquals(-1, in.readChar()); + in.close(); + tracker.close(); // redundant + } + + @Test public void testSeek() throws IOException { int NUM_LINES = 1000; @@ -375,28 +474,85 @@ public class TestResettableFileInputStream { assertEquals(11, Integer.parseInt(readLine(in, LINE_LEN).substring(0, 10))); } - /** - * Helper method that generates a line to test if parts of multi-byte characters on the - * edge of a buffer are handled properly. - */ - private static String generateWideCharLine(){ - String s = "éllo Wörld!\n"; - int size = (ResettableFileInputStream.DEFAULT_BUF_SIZE - 1) + s.length(); - return Strings.padStart(s, size , 'H'); + private ResettableInputStream initUtf8DecodeTest(DecodeErrorPolicy policy) + throws IOException { + writeBigBadUtf8Sequence(file); + return initInputStream(policy); } - /** - * Creates a file that contains a line that contains wide characters - * @param file - * @param charset - * @return - * @throws IOException - */ - private static String wideCharFileInit(File file, Charset charset) + private ResettableInputStream initInputStream(DecodeErrorPolicy policy) throws IOException { - String output = generateWideCharLine(); - Files.write(output.getBytes(charset), file); - return output; + return initInputStream(2048, Charsets.UTF_8, policy); + } + + private ResettableInputStream initInputStream(int bufferSize, Charset charset, DecodeErrorPolicy policy) + throws IOException { + PositionTracker tracker = new DurablePositionTracker(meta, file.getPath()); + ResettableInputStream in = new ResettableFileInputStream(file, tracker, + bufferSize, charset, policy); + return in; + } + + private void writeBigBadUtf8Sequence(File file) throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + generateUtf8Latin1Sequence(out); + generateUtf8OverlyLongSequence(out); + 
generateUtf8NonUnicodeSequence(out); + Files.write(out.toByteArray(), file); + } + + private void generateUtf8OverlyLongSequence(OutputStream out) + throws IOException { + out.write("Long: (".getBytes(Charsets.UTF_8)); + // Overly-long slash character should not be accepted. + out.write(new byte[] { (byte)0xe0, (byte)0x80, (byte)0xaf }); + out.write(")\n".getBytes(Charsets.UTF_8)); + } + + private void generateUtf8NonUnicodeSequence(OutputStream out) + throws IOException { + out.write("NonUnicode: (".getBytes(Charsets.UTF_8)); + // This is a valid 5-octet sequence but is not Unicode + out.write(new byte[]{(byte) 0xf8, (byte) 0xa1, (byte) 0xa1, (byte) 0xa1, + (byte) 0xa1}); + out.write(")\n".getBytes(Charsets.UTF_8)); + } + + private void generateUtf8Latin1Sequence(OutputStream out) throws IOException { + out.write("Latin1: (".getBytes(Charsets.UTF_8)); + // This is "e" with an accent in Latin-1 + out.write(new byte[] { (byte)0xe9 } ); + out.write(")\n".getBytes(Charsets.UTF_8)); + } + + private void generateLatin1InvalidSequence(OutputStream out) + throws IOException { + out.write("Invalid: (".getBytes(Charsets.UTF_8)); + // Not a valid character in Latin 1. 
+ out.write(new byte[] { (byte)0x81 } ); + out.write(")\n".getBytes(Charsets.UTF_8)); + } + + private void generateUtf8SurrogatePairSequence(OutputStream out) throws IOException { + // U+1F618 (UTF-8: f0 9f 98 98) FACE THROWING A KISS + out.write(new byte[]{(byte) 0xF0, (byte) 0x9F, (byte) 0x98, (byte) 0x98}); + } + + private void generateUtf16SurrogatePairSequence(OutputStream out) throws IOException { + // BOM + out.write(new byte[]{(byte) 0xFE, (byte) 0xFF}); + // U+1F618 (UTF-16: d83d de18) FACE THROWING A KISS + out.write(new byte[]{(byte) 0xD8, (byte) 0x3D, (byte) 0xDE, (byte) 0x18}); + } + + private void generateUtf83ByteSequence(OutputStream out) throws IOException { + // U+0A93 (UTF-8: e0 aa 93) GUJARATI LETTER O + out.write(new byte[]{(byte) 0xe0, (byte) 0xaa, (byte) 0x93}); + } + + private void generateShiftJis2ByteSequence(OutputStream out) throws IOException { + //U+4E9C (Shift JIS: 88 9f) CJK UNIFIED IDEOGRAPH + out.write(new byte[]{(byte) 0x88, (byte) 0x9f}); } /**
