flume git commit: FLUME-2215. ResettableFileInputStream can't support ucs-4 character

hshreedharan Wed, 27 May 2015 10:09:57 -0700

Repository: flume
Updated Branches:
  refs/heads/trunk f3b6ceb6a -> 344e0acca



FLUME-2215. ResettableFileInputStream can't support ucs-4 character

(Alexandre Dutra via Hari)


Project: http://git-wip-us.apache.org/repos/asf/flume/repo
Commit: http://git-wip-us.apache.org/repos/asf/flume/commit/344e0acc
Tree: http://git-wip-us.apache.org/repos/asf/flume/tree/344e0acc
Diff: http://git-wip-us.apache.org/repos/asf/flume/diff/344e0acc

Branch: refs/heads/trunk
Commit: 344e0accae5675fd3d14b8414531528607865aae
Parents: f3b6ceb
Author: Hari Shreedharan <[email protected]>
Authored: Wed May 27 09:58:23 2015 -0700
Committer: Hari Shreedharan <[email protected]>
Committed: Wed May 27 09:58:23 2015 -0700

----------------------------------------------------------------------
 .../ResettableFileInputStream.java              | 151 ++++++++-
 .../TestResettableFileInputStream.java          | 326 ++++++++++++++-----
 2 files changed, 375 insertions(+), 102 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/flume/blob/344e0acc/flume-ng-core/src/main/java/org/apache/flume/serialization/ResettableFileInputStream.java
----------------------------------------------------------------------
diff --git 
a/flume-ng-core/src/main/java/org/apache/flume/serialization/ResettableFileInputStream.java
 
b/flume-ng-core/src/main/java/org/apache/flume/serialization/ResettableFileInputStream.java
index 587ab29..622c09f 100644
--- 
a/flume-ng-core/src/main/java/org/apache/flume/serialization/ResettableFileInputStream.java
+++ 
b/flume-ng-core/src/main/java/org/apache/flume/serialization/ResettableFileInputStream.java
@@ -37,13 +37,49 @@ import java.nio.charset.CoderResult;
 import java.nio.charset.CodingErrorAction;
 
 /**
- * <p/>This class makes the following assumptions:
+ * <p>This class makes the following assumptions:</p>
  * <ol>
  *   <li>The underlying file is not changing while it is being read</li>
  * </ol>
  *
- * <p/>The ability to {@link #reset()} is dependent on the underlying {@link
- * PositionTracker} instance's durability semantics.
+ * <p>The ability to {@link #reset()} is dependent on the underlying {@link
+ * PositionTracker} instance's durability semantics.</p>
+ *
+ * <p><strong>A note on surrogate pairs:</strong></p>
+ *
+ * <p>The logic for decoding surrogate pairs is as follows:
+ * If no character has been decoded by a "normal" pass, and the buffer still 
has remaining bytes,
+ * then an attempt is made to read 2 characters in one pass.
+ * If it succeeds, then the first char (high surrogate) is returned;
+ * the second char (low surrogate) is recorded internally,
+ * and is returned at the next call to {@link #readChar()}.
+ * If it fails, then it is assumed that EOF has been reached.</p>
+ *
+ * <p>Impacts on position, mark and reset: when a surrogate pair is decoded, 
the position
+ * is incremented by the amount of bytes taken to decode the <em>entire</em> 
pair (usually, 4).
+ * This is the most reasonable choice since it would not be advisable
+ * to reset a stream to a position pointing to the second char in a pair of 
surrogates:
+ * such a dangling surrogate would not be properly decoded without its 
counterpart.</p>
+ *
+ * <p>Thus the behaviour of mark and reset is as follows:</p>
+ *
+ * <ol>
+ *   <li>If {@link #mark()} is called after a high surrogate has been 
returned by {@link #readChar()},
+ *   the marked position will be that of the character <em>following</em> the 
low surrogate,
+ *   <em>not</em> that of the low surrogate itself.</li>
+ *   <li>If {@link #reset()} is called after a high surrogate has been 
returned by {@link #readChar()},
+ *   the low surrogate is always returned by the next call to {@link 
#readChar()},
+ *   <em>before</em> the stream is actually reset to the last marked 
position.</li>
+ * </ol>
+ *
+ * <p>This ensures that no dangling high surrogate could ever be read as long 
as
+ * the same instance is used to read the whole pair. <strong>However, if 
{@link #reset()}
+ * is called after a high surrogate has been returned by {@link 
#readChar()},
+ * and a new instance of ResettableFileInputStream is used to resume reading,
+ * then the low surrogate char will be lost,
+ * resulting in a corrupted sequence of characters (dangling high 
surrogate).</strong>
+ * This situation is hopefully extremely unlikely to happen in real life.
+ * </p>
  */
 @InterfaceAudience.Private
 @InterfaceStability.Evolving
@@ -54,6 +90,14 @@ public class ResettableFileInputStream extends 
ResettableInputStream
 
   public static final int DEFAULT_BUF_SIZE = 16384;
 
+  /**
+   * The minimum acceptable buffer size to store bytes read
+   * from the underlying file. A minimum size of 8 ensures that the
+   * buffer always has enough space to contain multi-byte characters,
+   * including special sequences such as surrogate pairs, Byte Order Marks, 
etc.
+   */
+  public static final int MIN_BUF_SIZE = 8;
+
   private final File file;
   private final PositionTracker tracker;
   private final FileInputStream in;
@@ -67,6 +111,21 @@ public class ResettableFileInputStream extends 
ResettableInputStream
   private long syncPosition;
   private int maxCharWidth;
 
+
+  /**
+   * Whether this instance holds a low surrogate character.
+   */
+  private boolean hasLowSurrogate = false;
+
+  /**
+   * A low surrogate character read from a surrogate pair.
+   * When a surrogate pair is found, the high (first) surrogate
+   * is returned upon a call to {@link #readChar()},
+   * while the low (second) surrogate remains stored in memory,
+   * to be returned at the next call to {@link #readChar()}.
+   */
+  private char lowSurrogate;
+
   /**
    *
    * @param file
@@ -75,7 +134,8 @@ public class ResettableFileInputStream extends 
ResettableInputStream
    * @param tracker
    *        PositionTracker implementation to make offset position durable
    *
-   * @throws FileNotFoundException
+   * @throws FileNotFoundException If the file to read does not exist
+   * @throws IOException If the position reported by the tracker cannot be 
sought
    */
   public ResettableFileInputStream(File file, PositionTracker tracker)
       throws IOException {
@@ -91,12 +151,19 @@ public class ResettableFileInputStream extends 
ResettableInputStream
    *        PositionTracker implementation to make offset position durable
    *
    * @param bufSize
-   *        Size of the underlying buffer used for input
+   *        Size of the underlying buffer used for input. If less than 
{@link #MIN_BUF_SIZE},
+   *        a buffer of length {@link #MIN_BUF_SIZE} will be created instead.
    *
    * @param charset
    *        Character set used for decoding text, as necessary
    *
-   * @throws FileNotFoundException
+   * @param decodeErrorPolicy
+   *        A {@link DecodeErrorPolicy} instance to determine how
+   *        the decoder should behave in case of malformed input and/or
+   *        unmappable character.
+   *
+   * @throws FileNotFoundException If the file to read does not exist
+   * @throws IOException If the position reported by the tracker cannot be 
sought
    */
   public ResettableFileInputStream(File file, PositionTracker tracker,
       int bufSize, Charset charset, DecodeErrorPolicy decodeErrorPolicy)
@@ -105,16 +172,27 @@ public class ResettableFileInputStream extends 
ResettableInputStream
     this.tracker = tracker;
     this.in = new FileInputStream(file);
     this.chan = in.getChannel();
-    this.buf = ByteBuffer.allocateDirect(bufSize);
+    this.buf = ByteBuffer.allocateDirect(Math.max(bufSize, MIN_BUF_SIZE));
     buf.flip();
     this.byteBuf = new byte[1]; // single byte
-    this.charBuf = CharBuffer.allocate(1); // single char
+    this.charBuf = CharBuffer.allocate(2); // two chars for surrogate pairs
     charBuf.flip();
     this.fileSize = file.length();
     this.decoder = charset.newDecoder();
     this.position = 0;
     this.syncPosition = 0;
-    this.maxCharWidth = (int)Math.ceil(charset.newEncoder().maxBytesPerChar());
+    if(charset.name().startsWith("UTF-8")) {
+      // some JDKs wrongly report 3 bytes max
+      this.maxCharWidth = 4;
+    } else if(charset.name().startsWith("UTF-16")) {
+      // UTF_16BE and UTF_16LE wrongly report 2 bytes max
+      this.maxCharWidth = 4;
+    } else if(charset.name().startsWith("UTF-32")) {
+      // UTF_32BE and UTF_32LE wrongly report 4 bytes max
+      this.maxCharWidth = 8;
+    } else {
+      this.maxCharWidth = (int) 
Math.ceil(charset.newEncoder().maxBytesPerChar());
+    }
 
     CodingErrorAction errorAction;
     switch (decodeErrorPolicy) {
@@ -173,6 +251,14 @@ public class ResettableFileInputStream extends 
ResettableInputStream
 
   @Override
   public synchronized int readChar() throws IOException {
+
+    // Check whether we are in the middle of a surrogate pair,
+    // in which case, return the last (low surrogate) char of the pair.
+    if(hasLowSurrogate) {
+      hasLowSurrogate = false;
+      return lowSurrogate;
+    }
+
     // The decoder can have issues with multi-byte characters.
     // This check ensures that there are at least maxCharWidth bytes in the 
buffer
     // before reaching EOF.
@@ -184,6 +270,7 @@ public class ResettableFileInputStream extends 
ResettableInputStream
 
     int start = buf.position();
     charBuf.clear();
+    charBuf.limit(1);
 
     boolean isEndOfInput = false;
     if (position >= fileSize) {
@@ -198,20 +285,50 @@ public class ResettableFileInputStream extends 
ResettableInputStream
     int delta = buf.position() - start;
 
     charBuf.flip();
+
+    // Found a single char
     if (charBuf.hasRemaining()) {
       char c = charBuf.get();
-      // don't increment the persisted location if we are in between a
-      // surrogate pair, otherwise we may never recover if we seek() to this
-      // location!
-      incrPosition(delta, !Character.isHighSurrogate(c));
+      incrPosition(delta, true);
       return c;
+    }
 
-    // there may be a partial character in the decoder buffer
-    } else {
-      incrPosition(delta, false);
-      return -1;
+    // Found nothing, but the byte buffer has not been entirely consumed.
+    // This situation denotes the presence of a surrogate pair
+    // that can only be decoded if we have a 2-char buffer.
+    if(buf.hasRemaining()) {
+      charBuf.clear();
+      // increase the limit to 2
+      charBuf.limit(2);
+      // decode 2 chars in one pass
+      res = decoder.decode(buf, charBuf, isEndOfInput);
+      if (res.isMalformed() || res.isUnmappable()) {
+        res.throwException();
+      }
+      charBuf.flip();
+      // Check if we successfully decoded 2 chars
+      if (charBuf.remaining() == 2) {
+        char highSurrogate = charBuf.get();
+        // save second (low surrogate) char for later consumption
+        lowSurrogate = charBuf.get();
+        // Check if we really have a surrogate pair
+        if( ! Character.isHighSurrogate(highSurrogate) || ! 
Character.isLowSurrogate(lowSurrogate)) {
+          // This should only happen in case of bad sequences (dangling 
surrogate, etc.)
+          logger.warn("Decoded a pair of chars, but it does not seem to be a 
surrogate pair: {} {}", (int)highSurrogate, (int)lowSurrogate);
+        }
+        hasLowSurrogate = true;
+        // consider the pair as a single unit and increment position normally
+        delta = buf.position() - start;
+        incrPosition(delta, true);
+        // return the first (high surrogate) char of the pair
+        return highSurrogate;
+      }
     }
 
+    // end of file
+    incrPosition(delta, false);
+    return -1;
+
   }
 
   private void refillBuf() throws IOException {

http://git-wip-us.apache.org/repos/asf/flume/blob/344e0acc/flume-ng-core/src/test/java/org/apache/flume/serialization/TestResettableFileInputStream.java
----------------------------------------------------------------------
diff --git 
a/flume-ng-core/src/test/java/org/apache/flume/serialization/TestResettableFileInputStream.java
 
b/flume-ng-core/src/test/java/org/apache/flume/serialization/TestResettableFileInputStream.java
index d1240fb..2c559db 100644
--- 
a/flume-ng-core/src/test/java/org/apache/flume/serialization/TestResettableFileInputStream.java
+++ 
b/flume-ng-core/src/test/java/org/apache/flume/serialization/TestResettableFileInputStream.java
@@ -18,7 +18,6 @@
 package org.apache.flume.serialization;
 
 import com.google.common.base.Charsets;
-import com.google.common.base.Strings;
 import com.google.common.collect.Lists;
 import com.google.common.io.Files;
 import junit.framework.Assert;
@@ -29,7 +28,6 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import static org.junit.Assert.*;
-import static org.junit.Assert.assertEquals;
 
 import java.io.BufferedOutputStream;
 import java.io.ByteArrayOutputStream;
@@ -89,25 +87,75 @@ public class TestResettableFileInputStream {
     in.close();
   }
 
+
   /**
    * Ensure that we can process lines that contain multi byte characters in 
weird places
    * such as at the end of a buffer.
    * @throws IOException
    */
   @Test
-  public void testWideCharRead() throws IOException {
-    String output = wideCharFileInit(file, Charsets.UTF_8);
-
-    PositionTracker tracker = new DurablePositionTracker(meta, file.getPath());
-    ResettableInputStream in = new ResettableFileInputStream(file,  tracker);
+  public void testMultiByteCharRead() throws IOException {
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    out.write("1234567".getBytes(Charsets.UTF_8));
+    // write a multi byte char encompassing buffer boundaries
+    generateUtf83ByteSequence(out);
+    // buffer now contains 8 chars and 10 bytes total
+    Files.write(out.toByteArray(), file);
+    ResettableInputStream in = initInputStream(8, Charsets.UTF_8, 
DecodeErrorPolicy.FAIL);
+    String result = readLine(in, 8);
+    assertEquals("1234567\u0A93\n", result);
+  }
 
-    String result = readLine(in, output.length());
-    assertEquals(output, result);
+  /**
+   * Ensure that we can process UTF-8 lines that contain surrogate pairs
+   * even if they appear astride buffer boundaries.
+   * @throws IOException
+   */
+  @Test
+  public void testUtf8SurrogatePairRead() throws IOException {
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    out.write("1234567".getBytes(Charsets.UTF_8));
+    generateUtf8SurrogatePairSequence(out);
+    // buffer now contains 9 chars (7 "normal" and 2 surrogates) and 11 bytes 
total
+    // surrogate pair will encompass buffer boundaries
+    Files.write(out.toByteArray(), file);
+    ResettableInputStream in = initInputStream(8, Charsets.UTF_8, 
DecodeErrorPolicy.FAIL);
+    String result = readLine(in, 9);
+    assertEquals("1234567\uD83D\uDE18\n", result);
+  }
 
-    String afterEOF = readLine(in, output.length());
-    assertNull(afterEOF);
+  /**
+   * Ensure that we can process UTF-16 lines that contain surrogate pairs, even
+   * preceded by a Byte Order Mark (BOM).
+   * @throws IOException
+   */
+  @Test
+  public void testUtf16BOMAndSurrogatePairRead() throws IOException {
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    generateUtf16SurrogatePairSequence(out);
+    // buffer now contains 1 BOM and 2 chars (1 surrogate pair) and 6 bytes 
total (including 2-byte BOM)
+    Files.write(out.toByteArray(), file);
+    ResettableInputStream in = initInputStream(8, Charsets.UTF_16, 
DecodeErrorPolicy.FAIL);
+    String result = readLine(in, 2);
+    assertEquals("\uD83D\uDE18\n", result);
+  }
 
-    in.close();
+  /**
+   * Ensure that we can process Shift_JIS lines that contain multi byte 
Japanese chars
+   * even if they appear astride buffer boundaries.
+   * @throws IOException
+   */
+  @Test
+  public void testShiftJisSurrogateCharRead() throws IOException {
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    out.write("1234567".getBytes(Charset.forName("Shift_JIS")));
+    // write a multi byte char encompassing buffer boundaries
+    generateShiftJis2ByteSequence(out);
+    // buffer now contains 8 chars and 9 bytes total
+    Files.write(out.toByteArray(), file);
+    ResettableInputStream in = initInputStream(8, 
Charset.forName("Shift_JIS"), DecodeErrorPolicy.FAIL);
+    String result = readLine(in, 8);
+    assertEquals("1234567\u4E9C\n", result);
   }
 
   @Test(expected = MalformedInputException.class)
@@ -178,60 +226,6 @@ public class TestResettableFileInputStream {
     assertEquals("Invalid: (X)\n".replaceAll("X", "\ufffd"), sb.toString());
   }
 
-  private ResettableInputStream initUtf8DecodeTest(DecodeErrorPolicy policy)
-      throws IOException {
-    writeBigBadUtf8Sequence(file);
-    return initInputStream(policy);
-  }
-
-  private ResettableInputStream initInputStream(DecodeErrorPolicy policy)
-      throws IOException {
-    PositionTracker tracker = new DurablePositionTracker(meta, file.getPath());
-    ResettableInputStream in = new ResettableFileInputStream(file, tracker,
-        2048, Charsets.UTF_8, policy);
-    return in;
-  }
-
-  private void writeBigBadUtf8Sequence(File file) throws IOException {
-    ByteArrayOutputStream out = new ByteArrayOutputStream();
-    generateUtf8Latin1Sequence(out);
-    generateUtf8OverlyLongSequence(out);
-    generateUtf8NonUnicodeSequence(out);
-    Files.write(out.toByteArray(), file);
-  }
-
-  private void generateUtf8OverlyLongSequence(OutputStream out)
-      throws IOException {
-    out.write("Long: (".getBytes(Charsets.UTF_8));
-    // Overly-long slash character should not be accepted.
-    out.write(new byte[] { (byte)0xe0, (byte)0x80, (byte)0xaf });
-    out.write(")\n".getBytes(Charsets.UTF_8));
-  }
-
-  private void generateUtf8NonUnicodeSequence(OutputStream out)
-      throws IOException {
-    out.write("NonUnicode: (".getBytes(Charsets.UTF_8));
-    // This is a valid 5-octet sequence but is not Unicode
-    out.write(new byte[] { (byte)0xf8, (byte)0xa1, (byte)0xa1, (byte)0xa1,
-        (byte)0xa1 } );
-    out.write(")\n".getBytes(Charsets.UTF_8));
-  }
-
-  private void generateUtf8Latin1Sequence(OutputStream out) throws IOException 
{
-    out.write("Latin1: (".getBytes(Charsets.UTF_8));
-    // This is "e" with an accent in Latin-1
-    out.write(new byte[] { (byte)0xe9 } );
-    out.write(")\n".getBytes(Charsets.UTF_8));
-  }
-
-  private void generateLatin1InvalidSequence(OutputStream out)
-      throws IOException {
-    out.write("Invalid: (".getBytes(Charsets.UTF_8));
-    // Not a valid character in Latin 1.
-    out.write(new byte[] { (byte)0x81 } );
-    out.write(")\n".getBytes(Charsets.UTF_8));
-  }
-
   /**
    * Ensure a reset() brings us back to the default mark (beginning of file)
    * @throws IOException
@@ -291,6 +285,55 @@ public class TestResettableFileInputStream {
     in.close();
   }
 
+  /**
+   * Ensure that surrogate pairs work well with mark/reset.
+   * @throws IOException
+   */
+  @Test
+  public void testMarkResetWithSurrogatePairs() throws IOException {
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    out.write("foo".getBytes(Charsets.UTF_8));
+    generateUtf8SurrogatePairSequence(out);
+    out.write("bar".getBytes(Charsets.UTF_8));
+    Files.write(out.toByteArray(), file);
+    PositionTracker tracker = new DurablePositionTracker(meta, file.getPath());
+    ResettableInputStream in = new ResettableFileInputStream(file, tracker);
+    Assert.assertEquals('f', in.readChar());
+    Assert.assertEquals('o', in.readChar());
+    in.mark();
+    Assert.assertEquals('o', in.readChar());
+    // read high surrogate
+    Assert.assertEquals('\ud83d', in.readChar());
+    // call reset in the middle of a surrogate pair
+    in.reset();
+    // will read low surrogate *before* reverting back to mark, to ensure
+    // surrogate pair is properly read
+    Assert.assertEquals('\ude18', in.readChar());
+    // now back to marked position
+    Assert.assertEquals('o', in.readChar());
+    // read high surrogate again
+    Assert.assertEquals('\ud83d', in.readChar());
+    // call mark in the middle of a surrogate pair:
+    // will mark the position *after* the pair, *not* low surrogate's position
+    in.mark();
+    // will reset to the position *after* the pair
+    in.reset();
+    // read low surrogate normally despite reset being called
+    // so that the pair is entirely read
+    Assert.assertEquals('\ude18', in.readChar());
+    Assert.assertEquals('b', in.readChar());
+    Assert.assertEquals('a', in.readChar());
+    // will reset to the position *after* the pair
+    in.reset();
+    Assert.assertEquals('b', in.readChar());
+    Assert.assertEquals('a', in.readChar());
+    Assert.assertEquals('r', in.readChar());
+    Assert.assertEquals(-1, in.readChar());
+    in.close();
+    tracker.close(); // redundant
+  }
+
+
   @Test
   public void testResume() throws IOException {
     List<String> expected = multiLineFileInit(file, Charsets.UTF_8);
@@ -322,6 +365,62 @@ public class TestResettableFileInputStream {
     Assert.assertEquals(result3, result3a);
   }
 
+  /**
+   * Ensure that surrogate pairs work well when resuming
+   * reading. Specifically, this test brings up special situations
+   * where a surrogate pair cannot be correctly decoded because
+   * the second character is lost.
+   *
+   * @throws IOException
+   */
+  @Test
+  public void testResumeWithSurrogatePairs() throws IOException {
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    out.write("foo".getBytes(Charsets.UTF_8));
+    generateUtf8SurrogatePairSequence(out);
+    out.write("bar".getBytes(Charsets.UTF_8));
+    Files.write(out.toByteArray(), file);
+    PositionTracker tracker = new DurablePositionTracker(meta, file.getPath());
+    ResettableInputStream in = new ResettableFileInputStream(file, tracker);
+    Assert.assertEquals('f', in.readChar());
+    Assert.assertEquals('o', in.readChar());
+    in.mark();
+    Assert.assertEquals('o', in.readChar());
+    // read high surrogate
+    Assert.assertEquals('\ud83d', in.readChar());
+    // call reset in the middle of a surrogate pair
+    in.reset();
+    // close RIS - this will cause the low surrogate char
+    // stored in-memory to be lost
+    in.close();
+    tracker.close(); // redundant
+    // create new Tracker & RIS
+    tracker = new DurablePositionTracker(meta, file.getPath());
+    in = new ResettableFileInputStream(file, tracker);
+    // low surrogate char is now lost - resume from marked position
+    Assert.assertEquals('o', in.readChar());
+    // read high surrogate again
+    Assert.assertEquals('\ud83d', in.readChar());
+    // call mark in the middle of a surrogate pair:
+    // will mark the position *after* the pair, *not* low surrogate's position
+    in.mark();
+    // close RIS - this will cause the low surrogate char
+    // stored in-memory to be lost
+    in.close();
+    tracker.close(); // redundant
+    // create new Tracker & RIS
+    tracker = new DurablePositionTracker(meta, file.getPath());
+    in = new ResettableFileInputStream(file, tracker);
+    // low surrogate char is now lost - resume from marked position
+    Assert.assertEquals('b', in.readChar());
+    Assert.assertEquals('a', in.readChar());
+    Assert.assertEquals('r', in.readChar());
+    Assert.assertEquals(-1, in.readChar());
+    in.close();
+    tracker.close(); // redundant
+  }
+
+
   @Test
   public void testSeek() throws IOException {
     int NUM_LINES = 1000;
@@ -375,28 +474,85 @@ public class TestResettableFileInputStream {
     assertEquals(11, Integer.parseInt(readLine(in, LINE_LEN).substring(0, 
10)));
   }
 
-  /**
-   * Helper method that generates a line to test if parts of multi-byte 
characters on the
-   * edge of a buffer are handled properly.
-   */
-  private static String generateWideCharLine(){
-    String s = "éllo Wörld!\n";
-    int size = (ResettableFileInputStream.DEFAULT_BUF_SIZE - 1) + s.length();
-    return Strings.padStart(s, size , 'H');
+  private ResettableInputStream initUtf8DecodeTest(DecodeErrorPolicy policy)
+      throws IOException {
+    writeBigBadUtf8Sequence(file);
+    return initInputStream(policy);
   }
 
-  /**
-   * Creates a file that contains a line that contains wide characters
-   * @param file
-   * @param charset
-   * @return
-   * @throws IOException
-   */
-  private static String wideCharFileInit(File file, Charset charset)
+  private ResettableInputStream initInputStream(DecodeErrorPolicy policy)
       throws IOException {
-    String output = generateWideCharLine();
-    Files.write(output.getBytes(charset), file);
-    return output;
+    return initInputStream(2048, Charsets.UTF_8, policy);
+  }
+
+  private ResettableInputStream initInputStream(int bufferSize, Charset 
charset, DecodeErrorPolicy policy)
+      throws IOException {
+    PositionTracker tracker = new DurablePositionTracker(meta, file.getPath());
+    ResettableInputStream in = new ResettableFileInputStream(file, tracker,
+        bufferSize, charset, policy);
+    return in;
+  }
+
+  private void writeBigBadUtf8Sequence(File file) throws IOException {
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    generateUtf8Latin1Sequence(out);
+    generateUtf8OverlyLongSequence(out);
+    generateUtf8NonUnicodeSequence(out);
+    Files.write(out.toByteArray(), file);
+  }
+
+  private void generateUtf8OverlyLongSequence(OutputStream out)
+      throws IOException {
+    out.write("Long: (".getBytes(Charsets.UTF_8));
+    // Overly-long slash character should not be accepted.
+    out.write(new byte[] { (byte)0xe0, (byte)0x80, (byte)0xaf });
+    out.write(")\n".getBytes(Charsets.UTF_8));
+  }
+
+  private void generateUtf8NonUnicodeSequence(OutputStream out)
+      throws IOException {
+    out.write("NonUnicode: (".getBytes(Charsets.UTF_8));
+    // This is a valid 5-octet sequence but is not Unicode
+    out.write(new byte[]{(byte) 0xf8, (byte) 0xa1, (byte) 0xa1, (byte) 0xa1,
+        (byte) 0xa1});
+    out.write(")\n".getBytes(Charsets.UTF_8));
+  }
+
+  private void generateUtf8Latin1Sequence(OutputStream out) throws IOException 
{
+    out.write("Latin1: (".getBytes(Charsets.UTF_8));
+    // This is "e" with an accent in Latin-1
+    out.write(new byte[] { (byte)0xe9 } );
+    out.write(")\n".getBytes(Charsets.UTF_8));
+  }
+
+  private void generateLatin1InvalidSequence(OutputStream out)
+      throws IOException {
+    out.write("Invalid: (".getBytes(Charsets.UTF_8));
+    // Not a valid character in Latin 1.
+    out.write(new byte[] { (byte)0x81 } );
+    out.write(")\n".getBytes(Charsets.UTF_8));
+  }
+
+  private void generateUtf8SurrogatePairSequence(OutputStream out) throws 
IOException {
+    // U+1F618 (UTF-8: f0 9f 98 98) FACE THROWING A KISS
+    out.write(new byte[]{(byte) 0xF0, (byte) 0x9F, (byte) 0x98, (byte) 0x98});
+  }
+
+  private void generateUtf16SurrogatePairSequence(OutputStream out) throws 
IOException {
+    // BOM
+    out.write(new byte[]{(byte) 0xFE, (byte) 0xFF});
+    // U+1F618 (UTF-16: d83d de18) FACE THROWING A KISS
+    out.write(new byte[]{(byte) 0xD8, (byte) 0x3D, (byte) 0xDE, (byte) 0x18});
+  }
+
+  private void generateUtf83ByteSequence(OutputStream out) throws IOException {
+    // U+0A93 (UTF-8: e0 aa 93) GUJARATI LETTER O
+    out.write(new byte[]{(byte) 0xe0, (byte) 0xaa, (byte) 0x93});
+  }
+
+  private void generateShiftJis2ByteSequence(OutputStream out) throws 
IOException {
+    //U+4E9C (Shift JIS: 88 9f) CJK UNIFIED IDEOGRAPH
+    out.write(new byte[]{(byte) 0x88, (byte) 0x9f});
   }
 
   /**

flume git commit: FLUME-2215. ResettableFileInputStream can't support ucs-4 character

Reply via email to