DRILL-972: Add newline if no delimiter is specified; avoid rescanning (contains DRILL-950).
Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/d84190f2 Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/d84190f2 Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/d84190f2 Branch: refs/heads/master Commit: d84190f22c57e78f2e236afa5f5b03b179b031d9 Parents: dbaec9d Author: Sudheesh Katkam <skat...@maprtech.com> Authored: Tue Jun 17 14:21:38 2014 -0700 Committer: Jacques Nadeau <jacq...@apache.org> Committed: Wed Jun 25 17:54:48 2014 -0700 ---------------------------------------------------------------------- .../exec/store/easy/text/TextFormatPlugin.java | 2 +- .../exec/store/text/DrillTextRecordReader.java | 14 +++- .../drill/exec/store/text/TestTextColumn.java | 73 ++++++++++++++++++++ .../resources/bootstrap-storage-plugins.json | 4 ++ .../test/resources/store/text/data/letters.csv | 3 + .../test/resources/store/text/data/letters.txt | 3 + pom.xml | 1 + 7 files changed, 96 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/d84190f2/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/TextFormatPlugin.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/TextFormatPlugin.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/TextFormatPlugin.java index 3935008..81a23b0 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/TextFormatPlugin.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/TextFormatPlugin.java @@ -100,7 +100,7 @@ public class TextFormatPlugin extends EasyFormatPlugin<TextFormatPlugin.TextForm public static class TextFormatConfig implements FormatPluginConfig { public List<String> extensions; - public String delimiter; + public String delimiter = "\n"; public List<String> getExtensions() { return extensions; http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/d84190f2/exec/java-exec/src/main/java/org/apache/drill/exec/store/text/DrillTextRecordReader.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/text/DrillTextRecordReader.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/text/DrillTextRecordReader.java index b5b5b3c..5bf571d 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/text/DrillTextRecordReader.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/text/DrillTextRecordReader.java @@ -126,9 +126,13 @@ public class DrillTextRecordReader implements RecordReader { break; } start = end; - end = find(value, delimiter, start + 1); - if (end == -1) { + if (delimiter == '\n') { end = value.getLength(); + } else { + end = find(value, delimiter, start + 1); + if (end == -1) { + end = value.getLength(); + } } if (numCols > 0 && i++ < columnIds.get(p)) { if (!vector.getMutator().addSafe(recordCount, value.getBytes(), start + 1, 0)) { @@ -166,8 +170,12 @@ public class DrillTextRecordReader implements RecordReader { int len = text.getLength(); int p = start; byte[] bytes = text.getBytes(); + boolean inQuotes = false; while (p < len) { - if (bytes[p] == what) { + if ('\"' == bytes[p]) { + inQuotes = !inQuotes; + } + if (!inQuotes && bytes[p] == what) { return p; } p++; http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/d84190f2/exec/java-exec/src/test/java/org/apache/drill/exec/store/text/TestTextColumn.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/store/text/TestTextColumn.java b/exec/java-exec/src/test/java/org/apache/drill/exec/store/text/TestTextColumn.java index fb729d5..395ec02 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/exec/store/text/TestTextColumn.java +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/store/text/TestTextColumn.java @@ -18,8 +18,19 @@ package org.apache.drill.exec.store.text; import org.apache.drill.BaseTestQuery; +import org.apache.drill.exec.exception.SchemaChangeException; +import org.apache.drill.exec.record.RecordBatchLoader; +import org.apache.drill.exec.record.VectorWrapper; +import org.apache.drill.exec.rpc.user.QueryResultBatch; +import org.apache.drill.exec.vector.ValueVector; import org.junit.Test; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.assertEquals; + public class TestTextColumn extends BaseTestQuery{ static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(TestTextColumn.class); @@ -27,4 +38,66 @@ public class TestTextColumn extends BaseTestQuery{ public void testCsvColumnSelection() throws Exception{ test("select columns[0] as region_id, columns[1] as country from dfs.`[WORKING_PATH]/src/test/resources/store/text/data/regions.csv`"); } + + @Test + public void testDefaultDelimiterColumnSelection() throws Exception { + List<QueryResultBatch> batches = testSqlWithResults("SELECT columns[0] as entire_row " + + "from dfs.`[WORKING_PATH]/src/test/resources/store/text/data/letters.txt`"); + + List<List<String>> expectedOutput = Arrays.asList( + Arrays.asList("\"a, b,\",\"c\",\"d,, \\n e\""), + Arrays.asList("\"d, e,\",\"f\",\"g,, \\n h\""), + Arrays.asList("\"g, h,\",\"i\",\"j,, \\n k\"")); + + List<List<String>> actualOutput = getOutput(batches); + validateOutput(expectedOutput, actualOutput); + } + + @Test + public void testCsvColumnSelectionCommasInsideQuotes() throws Exception { + List<QueryResultBatch> batches = testSqlWithResults("SELECT columns[0] as col1, columns[1] as col2, columns[2] as col3," + + "columns[3] as col4 from dfs.`[WORKING_PATH]/src/test/resources/store/text/data/letters.csv`"); + + List<List<String>> expectedOutput = Arrays.asList( + Arrays.asList("\"a, b,\"", "\"c\"", "\"d,, \\n e\"","\"f\\\"g\""), + Arrays.asList("\"d, e,\"", "\"f\"", "\"g,, \\n h\"","\"i\\\"j\""), + Arrays.asList("\"g, h,\"", "\"i\"", "\"j,, \\n k\"","\"l\\\"m\"")); + + List<List<String>> actualOutput = getOutput(batches); + validateOutput(expectedOutput, actualOutput); + } + + private List<List<String>> getOutput(List<QueryResultBatch> batches) throws SchemaChangeException { + List<List<String>> output = new ArrayList<>(); + RecordBatchLoader loader = new RecordBatchLoader(getAllocator()); + int last = 0; + for(QueryResultBatch batch : batches) { + int rows = batch.getHeader().getRowCount(); + if(batch.getData() != null) { + loader.load(batch.getHeader().getDef(), batch.getData()); + for (int i = 0; i < rows; ++i) { + output.add(new ArrayList<String>()); + for (VectorWrapper<?> vw: loader) { + ValueVector.Accessor accessor = vw.getValueVector().getAccessor(); + output.get(last).add(accessor.getObject(i).toString()); + } + ++last; + } + } + loader.clear(); + batch.release(); + } + return output; + } + + private void validateOutput(List<List<String>> expected, List<List<String>> actual) { + assertEquals(expected.size(), actual.size()); + for (int i = 0 ; i < expected.size(); ++i) { + assertEquals(expected.get(i).size(), actual.get(i).size()); + for (int j = 0; j < expected.get(i).size(); ++j) { + assertEquals(expected.get(i).get(j), actual.get(i).get(j)); + } + } + } + } http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/d84190f2/exec/java-exec/src/test/resources/bootstrap-storage-plugins.json ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/resources/bootstrap-storage-plugins.json b/exec/java-exec/src/test/resources/bootstrap-storage-plugins.json index 9503482..534b120 100644 --- a/exec/java-exec/src/test/resources/bootstrap-storage-plugins.json +++ b/exec/java-exec/src/test/resources/bootstrap-storage-plugins.json @@ -35,6 +35,10 @@ }, "parquet" : { type: "parquet" + }, + "txt" : { + type : "text", + extensions: [ "txt" ] } } }, http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/d84190f2/exec/java-exec/src/test/resources/store/text/data/letters.csv ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/resources/store/text/data/letters.csv b/exec/java-exec/src/test/resources/store/text/data/letters.csv new file mode 100644 index 0000000..4d724e8 --- /dev/null +++ b/exec/java-exec/src/test/resources/store/text/data/letters.csv @@ -0,0 +1,3 @@ +"a, b,","c","d,, \n e","f\"g" +"d, e,","f","g,, \n h","i\"j" +"g, h,","i","j,, \n k","l\"m" \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/d84190f2/exec/java-exec/src/test/resources/store/text/data/letters.txt ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/resources/store/text/data/letters.txt b/exec/java-exec/src/test/resources/store/text/data/letters.txt new file mode 100644 index 0000000..14b9cb6 --- /dev/null +++ b/exec/java-exec/src/test/resources/store/text/data/letters.txt @@ -0,0 +1,3 @@ +"a, b,","c","d,, \n e" +"d, e,","f","g,, \n h" +"g, h,","i","j,, \n k" \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/d84190f2/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 7dbdef0..129cc22 100644 --- a/pom.xml +++ b/pom.xml @@ -128,6 +128,7 @@ <exclude>**/*.sql</exclude> <exclude>**/git.properties</exclude> <exclude>**/*.csv</exclude> + <exclude>**/*.txt</exclude> <exclude>**/drill-*.conf</exclude> <exclude>**/.buildpath</exclude> <exclude>**/*.proto</exclude>