Updated Branches: refs/heads/master 98bc9e19c -> fef22041b
DRILL 211 - index out of bounds error in parquet reader. Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/60e2080f Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/60e2080f Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/60e2080f Branch: refs/heads/master Commit: 60e2080fa557cddfe8146a706612444724efe716 Parents: 98bc9e1 Author: Jason Altekruse <[email protected]> Authored: Fri Sep 6 00:53:50 2013 -0500 Committer: Jacques Nadeau <[email protected]> Committed: Thu Sep 5 23:01:15 2013 -0700 ---------------------------------------------------------------------- .../exec/store/parquet/VarLenBinaryReader.java | 5 -- .../exec/store/ParquetRecordReaderTest.java | 49 +++++++++++++++++++- 2 files changed, 48 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/60e2080f/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/VarLenBinaryReader.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/VarLenBinaryReader.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/VarLenBinaryReader.java index f20a2f3..3286314 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/VarLenBinaryReader.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/VarLenBinaryReader.java @@ -116,7 +116,6 @@ public class VarLenBinaryReader { columnReader.dataTypeLengthInBits = BytesUtils.readIntLittleEndian(bytes, (int) columnReader.pageReadStatus.readPosInBytes); lengthVarFieldsInCurrentRecord += columnReader.dataTypeLengthInBits; - } for (NullableVarLengthColumn columnReader : nullableColumns) { if (columnReader.pageReadStatus.currentPage == null @@ -162,10 +161,6 @@ public class VarLenBinaryReader { columnReader.pageReadStatus.valuesRead++; columnReader.valuesReadInCurrentPass++; currVec.getMutator().setValueCount((int)recordsReadInCurrentPass); - // reached the end of a page - if ( columnReader.pageReadStatus.valuesRead == columnReader.pageReadStatus.currentPage.getValueCount()) { - columnReader.pageReadStatus.next(); - } } for (NullableVarLengthColumn columnReader : nullableColumns) { bytes = columnReader.pageReadStatus.pageDataByteArray; http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/60e2080f/exec/java-exec/src/test/java/org/apache/drill/exec/store/ParquetRecordReaderTest.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/store/ParquetRecordReaderTest.java b/exec/java-exec/src/test/java/org/apache/drill/exec/store/ParquetRecordReaderTest.java index cf790ac..93f1f73 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/exec/store/ParquetRecordReaderTest.java +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/store/ParquetRecordReaderTest.java @@ -40,7 +40,6 @@ import org.apache.drill.exec.server.Drillbit; import org.apache.drill.exec.server.RemoteServiceSet; import org.apache.drill.exec.store.json.JsonSchemaProvider; -import org.apache.drill.exec.store.parquet.ParquetStorageEngine; import org.apache.drill.exec.vector.BaseDataValueVector; import org.apache.drill.exec.vector.ValueVector; import org.apache.hadoop.conf.Configuration; @@ -93,6 +92,29 @@ public class ParquetRecordReaderTest { props.fields.put("bin2", new FieldInfo("binary", "bin2", -1, bin2Vals, TypeProtos.MinorType.VARBINARY, props)); } + private void populatePigTPCHCustomerFields(ParquetTestProperties props){ + // all of the data in the fieldInfo constructors doesn't matter because the file is generated outside the test + props.fields.put("C_CUSTKEY", new FieldInfo("int32", "integer", 32, intVals, TypeProtos.MinorType.INT, props)); + props.fields.put("C_NATIONKEY", new FieldInfo("int64", "bigInt", 64, longVals, TypeProtos.MinorType.BIGINT, props)); + props.fields.put("C_ACCTBAL", new FieldInfo("float", "f", 32, floatVals, TypeProtos.MinorType.FLOAT4, props)); + props.fields.put("C_NAME", new FieldInfo("double", "d", 64, doubleVals, TypeProtos.MinorType.FLOAT8, props)); + props.fields.put("C_ADDRESS", new FieldInfo("boolean", "b", 1, boolVals, TypeProtos.MinorType.BIT, props)); + props.fields.put("C_PHONE", new FieldInfo("binary", "bin", -1, binVals, TypeProtos.MinorType.VARBINARY, props)); + props.fields.put("C_MKTSEGMENT", new FieldInfo("binary", "bin2", -1, bin2Vals, TypeProtos.MinorType.VARBINARY, props)); + props.fields.put("C_COMMENT", new FieldInfo("binary", "bin2", -1, bin2Vals, TypeProtos.MinorType.VARBINARY, props)); + } + + private void populatePigTPCHSupplierFields(ParquetTestProperties props){ + // all of the data in the fieldInfo constructors doesn't matter because the file is generated outside the test + props.fields.put("S_SUPPKEY", new FieldInfo("int32", "integer", 32, intVals, TypeProtos.MinorType.INT, props)); + props.fields.put("S_NATIONKEY", new FieldInfo("int64", "bigInt", 64, longVals, TypeProtos.MinorType.BIGINT, props)); + props.fields.put("S_ACCTBAL", new FieldInfo("float", "f", 32, floatVals, TypeProtos.MinorType.FLOAT4, props)); + props.fields.put("S_NAME", new FieldInfo("double", "d", 64, doubleVals, TypeProtos.MinorType.FLOAT8, props)); + props.fields.put("S_ADDRESS", new FieldInfo("boolean", "b", 1, boolVals, TypeProtos.MinorType.BIT, props)); + props.fields.put("S_PHONE", new FieldInfo("binary", "bin", -1, binVals, TypeProtos.MinorType.VARBINARY, props)); + props.fields.put("S_COMMENT", new FieldInfo("binary", "bin2", -1, bin2Vals, TypeProtos.MinorType.VARBINARY, props)); + } + @Test public void testMultipleRowGroups() throws Exception { HashMap<String, FieldInfo> fields = new HashMap<>(); @@ -144,6 +166,26 @@ public class ParquetRecordReaderTest { "/tmp/test.parquet", i, props); } + // requires binary file generated by pig from TPCH data, also have to disable assert where data is coming in + @Ignore + @Test + public void testMultipleRowGroupsAndReadsPigError() throws Exception { + HashMap<String, FieldInfo> fields = new HashMap<>(); + ParquetTestProperties props = new ParquetTestProperties(4, 3000, DEFAULT_BYTES_PER_PAGE, fields); + populatePigTPCHCustomerFields(props); +// populatePigTPCHSupplierFields(props); + String readEntries = ""; + // number of times to read the file + int i = 1; + for (int j = 0; j < i; j++){ + readEntries += "{path: \"/tmp/tpc-h/customer\"}"; + if (j < i - 1) + readEntries += ","; + } + testParquetFullEngineEventBased(false, "/parquet_scan_screen_read_entry_replace.json", readEntries, + "/tmp/test.parquet", i, props); + } + @Test public void testMultipleRowGroupsEvent() throws Exception { HashMap<String, FieldInfo> fields = new HashMap<>(); @@ -441,6 +483,11 @@ public class ParquetRecordReaderTest { client.runQuery(UserProtos.QueryType.LOGICAL, Files.toString(FileUtils.getResourceAsFile(plan), Charsets.UTF_8), resultListener); } resultListener.getResults(); + for (String s : resultListener.valuesChecked.keySet()) { + assertEquals("Record count incorrect for column: " + s, + props.recordsPerRowGroup * props.numberRowGroups * numberOfTimesRead, (long) resultListener.valuesChecked.get(s)); + logger.debug("Column {}, Values read:{}", s, resultListener.valuesChecked.get(s)); + } long D = System.nanoTime(); System.out.println(String.format("Took %f s to run query", (float)(D-C) / 1E9)); }
