DRILL-3178: csv reader should allow newlines inside quotes

This closes #593


Project: http://git-wip-us.apache.org/repos/asf/drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/42948feb
Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/42948feb
Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/42948feb

Branch: refs/heads/master
Commit: 42948feb4a45f98f3d116d2e2a765cc3fadb5937
Parents: 89f2633
Author: Francois Methot <fmetho...@gmail.com>
Authored: Tue Sep 20 20:16:15 2016 -0400
Committer: Parth Chandra <par...@apache.org>
Committed: Tue Oct 18 10:47:52 2016 -0700

----------------------------------------------------------------------
 .../store/easy/text/compliant/TextInput.java    | 60 ++++++++++++--------
 .../store/easy/text/compliant/TextReader.java   |  4 +-
 .../exec/store/text/TestNewTextReader.java      | 19 +++++++
 .../resources/store/text/WithQuotedCrLf.tbl     |  6 ++
 4 files changed, 62 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/drill/blob/42948feb/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextInput.java
----------------------------------------------------------------------
diff --git 
a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextInput.java
 
b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextInput.java
index 1bb2d30..971bb9b 100644
--- 
a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextInput.java
+++ 
b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextInput.java
@@ -261,6 +261,41 @@ final class TextInput {
    * @throws IOException
    */
   public final byte nextChar() throws IOException {
+    byte byteChar = nextCharNoNewLineCheck();
+    int bufferPtrTemp = bufferPtr - 1;
+    if (byteChar == lineSeparator[0]) {
+       for (int i = 1; i < lineSeparator.length; i++, bufferPtrTemp++) {
+         if (lineSeparator[i] != buffer.getByte(bufferPtrTemp)) {
+           return byteChar;
+         }
+       }
+
+        lineCount++;
+        byteChar = normalizedLineSeparator;
+
+        // we don't need to update buffer position if line separator is one 
byte long
+        if (lineSeparator.length > 1) {
+          bufferPtr += (lineSeparator.length - 1);
+          if (bufferPtr >= length) {
+            if (length != -1) {
+              updateBuffer();
+            } else {
+              throw StreamFinishedPseudoException.INSTANCE;
+            }
+          }
+        }
+      }
+
+    return byteChar;
+  }
+
+  /**
+   * Get next byte from stream.  Do no maintain any line count  Will throw a 
StreamFinishedPseudoException
+   * when the stream has run out of bytes.
+   * @return next byte from stream.
+   * @throws IOException
+   */
+  public final byte nextCharNoNewLineCheck() throws IOException {
 
     if (length == -1) {
       throw StreamFinishedPseudoException.INSTANCE;
@@ -283,31 +318,6 @@ final class TextInput {
 
     bufferPtr++;
 
-    // monitor for next line.
-    int bufferPtrTemp = bufferPtr - 1;
-    if (byteChar == lineSeparator[0]) {
-      for (int i = 1; i < lineSeparator.length; i++, bufferPtrTemp++) {
-        if (lineSeparator[i] !=  buffer.getByte(bufferPtrTemp)) {
-          return byteChar;
-        }
-      }
-
-      lineCount++;
-      byteChar = normalizedLineSeparator;
-
-      // we don't need to update buffer position if line separator is one byte 
long
-      if (lineSeparator.length > 1) {
-        bufferPtr += (lineSeparator.length - 1);
-        if (bufferPtr >= length) {
-          if (length != -1) {
-            updateBuffer();
-          } else {
-            throw StreamFinishedPseudoException.INSTANCE;
-          }
-        }
-      }
-    }
-
     return byteChar;
   }
 

http://git-wip-us.apache.org/repos/asf/drill/blob/42948feb/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextReader.java
----------------------------------------------------------------------
diff --git 
a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextReader.java
 
b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextReader.java
index 82427bb..d218846 100644
--- 
a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextReader.java
+++ 
b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextReader.java
@@ -231,7 +231,7 @@ final class TextReader {
     final TextInput input = this.input;
     final byte quote = this.quote;
 
-    ch = input.nextChar();
+    ch = input.nextCharNoNewLineCheck();
 
     while (!(prev == quote && (ch == delimiter || ch == newLine || 
isWhite(ch)))) {
       if (ch != quote) {
@@ -257,7 +257,7 @@ final class TextReader {
       } else {
         prev = ch;
       }
-      ch = input.nextChar();
+      ch = input.nextCharNoNewLineCheck();
     }
 
     // Handles whitespaces after quoted value:

http://git-wip-us.apache.org/repos/asf/drill/blob/42948feb/exec/java-exec/src/test/java/org/apache/drill/exec/store/text/TestNewTextReader.java
----------------------------------------------------------------------
diff --git 
a/exec/java-exec/src/test/java/org/apache/drill/exec/store/text/TestNewTextReader.java
 
b/exec/java-exec/src/test/java/org/apache/drill/exec/store/text/TestNewTextReader.java
index 6b8e16a..d7fb963 100644
--- 
a/exec/java-exec/src/test/java/org/apache/drill/exec/store/text/TestNewTextReader.java
+++ 
b/exec/java-exec/src/test/java/org/apache/drill/exec/store/text/TestNewTextReader.java
@@ -25,6 +25,7 @@ import org.apache.drill.BaseTestQuery;
 import org.apache.drill.common.exceptions.UserRemoteException;
 import org.apache.drill.common.util.FileUtils;
 import org.apache.drill.exec.proto.UserBitShared.DrillPBError.ErrorType;
+import org.junit.Ignore;
 import org.junit.Test;
 
 public class TestNewTextReader extends BaseTestQuery {
@@ -39,6 +40,7 @@ public class TestNewTextReader extends BaseTestQuery {
         .go();
   }
 
+  @Ignore ("Not needed any more. (DRILL-3178)")
   @Test
   public void ensureFailureOnNewLineDelimiterWithinQuotes() {
     try {
@@ -112,4 +114,21 @@ public class TestNewTextReader extends BaseTestQuery {
         .build()
         .run();
   }
+
+  @Test // see DRILL-3718
+  public void testCrLfSeparatedWithQuote() throws Exception {
+    final String root = 
FileUtils.getResourceAsFile("/store/text/WithQuotedCrLf.tbl").toURI().toString();
+    final String query = String.format("select columns[0] as c0, columns[1] as 
c1, columns[2] as c2 \n" +
+        "from dfs_test.`%s` ", root);
+
+    testBuilder()
+        .sqlQuery(query)
+        .unOrdered()
+        .baselineColumns("c0", "c1", "c2")
+        .baselineValues("a\n1", "a", "a")
+        .baselineValues("a", "a\n2", "a")
+        .baselineValues("a", "a", "a\n3")
+        .build()
+        .run();
+  }
 }

http://git-wip-us.apache.org/repos/asf/drill/blob/42948feb/exec/java-exec/src/test/resources/store/text/WithQuotedCrLf.tbl
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/test/resources/store/text/WithQuotedCrLf.tbl 
b/exec/java-exec/src/test/resources/store/text/WithQuotedCrLf.tbl
new file mode 100644
index 0000000..5d2f081
--- /dev/null
+++ b/exec/java-exec/src/test/resources/store/text/WithQuotedCrLf.tbl
@@ -0,0 +1,6 @@
+"a
+1"|a|a
+a|"a
+2"|a
+a|a|"a
+3"
\ No newline at end of file

Reply via email to