DRILL-3178: csv reader should allow newlines inside quotes This closes #593
Project: http://git-wip-us.apache.org/repos/asf/drill/repo Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/42948feb Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/42948feb Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/42948feb Branch: refs/heads/master Commit: 42948feb4a45f98f3d116d2e2a765cc3fadb5937 Parents: 89f2633 Author: FranÂcois Methot <fmetho...@gmail.com> Authored: Tue Sep 20 20:16:15 2016 -0400 Committer: Parth Chandra <par...@apache.org> Committed: Tue Oct 18 10:47:52 2016 -0700 ---------------------------------------------------------------------- .../store/easy/text/compliant/TextInput.java | 60 ++++++++++++-------- .../store/easy/text/compliant/TextReader.java | 4 +- .../exec/store/text/TestNewTextReader.java | 19 +++++++ .../resources/store/text/WithQuotedCrLf.tbl | 6 ++ 4 files changed, 62 insertions(+), 27 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/drill/blob/42948feb/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextInput.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextInput.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextInput.java index 1bb2d30..971bb9b 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextInput.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextInput.java @@ -261,6 +261,41 @@ final class TextInput { * @throws IOException */ public final byte nextChar() throws IOException { + byte byteChar = nextCharNoNewLineCheck(); + int bufferPtrTemp = bufferPtr - 1; + if (byteChar == lineSeparator[0]) { + for (int i = 1; i < lineSeparator.length; i++, bufferPtrTemp++) { + if (lineSeparator[i] != buffer.getByte(bufferPtrTemp)) { + return byteChar; + } + } + + lineCount++; + byteChar = normalizedLineSeparator; + + // we don't need to update buffer position if line separator is one byte long + if (lineSeparator.length > 1) { + bufferPtr += (lineSeparator.length - 1); + if (bufferPtr >= length) { + if (length != -1) { + updateBuffer(); + } else { + throw StreamFinishedPseudoException.INSTANCE; + } + } + } + } + + return byteChar; + } + + /** + * Get next byte from stream. Do no maintain any line count Will throw a StreamFinishedPseudoException + * when the stream has run out of bytes. + * @return next byte from stream. + * @throws IOException + */ + public final byte nextCharNoNewLineCheck() throws IOException { if (length == -1) { throw StreamFinishedPseudoException.INSTANCE; @@ -283,31 +318,6 @@ final class TextInput { bufferPtr++; - // monitor for next line. - int bufferPtrTemp = bufferPtr - 1; - if (byteChar == lineSeparator[0]) { - for (int i = 1; i < lineSeparator.length; i++, bufferPtrTemp++) { - if (lineSeparator[i] != buffer.getByte(bufferPtrTemp)) { - return byteChar; - } - } - - lineCount++; - byteChar = normalizedLineSeparator; - - // we don't need to update buffer position if line separator is one byte long - if (lineSeparator.length > 1) { - bufferPtr += (lineSeparator.length - 1); - if (bufferPtr >= length) { - if (length != -1) { - updateBuffer(); - } else { - throw StreamFinishedPseudoException.INSTANCE; - } - } - } - } - return byteChar; } http://git-wip-us.apache.org/repos/asf/drill/blob/42948feb/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextReader.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextReader.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextReader.java index 82427bb..d218846 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextReader.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/TextReader.java @@ -231,7 +231,7 @@ final class TextReader { final TextInput input = this.input; final byte quote = this.quote; - ch = input.nextChar(); + ch = input.nextCharNoNewLineCheck(); while (!(prev == quote && (ch == delimiter || ch == newLine || isWhite(ch)))) { if (ch != quote) { @@ -257,7 +257,7 @@ final class TextReader { } else { prev = ch; } - ch = input.nextChar(); + ch = input.nextCharNoNewLineCheck(); } // Handles whitespaces after quoted value: http://git-wip-us.apache.org/repos/asf/drill/blob/42948feb/exec/java-exec/src/test/java/org/apache/drill/exec/store/text/TestNewTextReader.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/store/text/TestNewTextReader.java b/exec/java-exec/src/test/java/org/apache/drill/exec/store/text/TestNewTextReader.java index 6b8e16a..d7fb963 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/exec/store/text/TestNewTextReader.java +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/store/text/TestNewTextReader.java @@ -25,6 +25,7 @@ import org.apache.drill.BaseTestQuery; import org.apache.drill.common.exceptions.UserRemoteException; import org.apache.drill.common.util.FileUtils; import org.apache.drill.exec.proto.UserBitShared.DrillPBError.ErrorType; +import org.junit.Ignore; import org.junit.Test; public class TestNewTextReader extends BaseTestQuery { @@ -39,6 +40,7 @@ public class TestNewTextReader extends BaseTestQuery { .go(); } + @Ignore ("Not needed any more. (DRILL-3178)") @Test public void ensureFailureOnNewLineDelimiterWithinQuotes() { try { @@ -112,4 +114,21 @@ public class TestNewTextReader extends BaseTestQuery { .build() .run(); } + + @Test // see DRILL-3718 + public void testCrLfSeparatedWithQuote() throws Exception { + final String root = FileUtils.getResourceAsFile("/store/text/WithQuotedCrLf.tbl").toURI().toString(); + final String query = String.format("select columns[0] as c0, columns[1] as c1, columns[2] as c2 \n" + + "from dfs_test.`%s` ", root); + + testBuilder() + .sqlQuery(query) + .unOrdered() + .baselineColumns("c0", "c1", "c2") + .baselineValues("a\n1", "a", "a") + .baselineValues("a", "a\n2", "a") + .baselineValues("a", "a", "a\n3") + .build() + .run(); + } } http://git-wip-us.apache.org/repos/asf/drill/blob/42948feb/exec/java-exec/src/test/resources/store/text/WithQuotedCrLf.tbl ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/resources/store/text/WithQuotedCrLf.tbl b/exec/java-exec/src/test/resources/store/text/WithQuotedCrLf.tbl new file mode 100644 index 0000000..5d2f081 --- /dev/null +++ b/exec/java-exec/src/test/resources/store/text/WithQuotedCrLf.tbl @@ -0,0 +1,6 @@ +"a +1"|a|a +a|"a +2"|a +a|a|"a +3" \ No newline at end of file