Updated Branches:
  refs/heads/master 98bc9e19c -> fef22041b

DRILL-211: index out of bounds error in parquet reader.


Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/60e2080f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/60e2080f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/60e2080f

Branch: refs/heads/master
Commit: 60e2080fa557cddfe8146a706612444724efe716
Parents: 98bc9e1
Author: Jason Altekruse <[email protected]>
Authored: Fri Sep 6 00:53:50 2013 -0500
Committer: Jacques Nadeau <[email protected]>
Committed: Thu Sep 5 23:01:15 2013 -0700

----------------------------------------------------------------------
 .../exec/store/parquet/VarLenBinaryReader.java  |  5 --
 .../exec/store/ParquetRecordReaderTest.java     | 49 +++++++++++++++++++-
 2 files changed, 48 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/60e2080f/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/VarLenBinaryReader.java
----------------------------------------------------------------------
diff --git 
a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/VarLenBinaryReader.java
 
b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/VarLenBinaryReader.java
index f20a2f3..3286314 100644
--- 
a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/VarLenBinaryReader.java
+++ 
b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/VarLenBinaryReader.java
@@ -116,7 +116,6 @@ public class VarLenBinaryReader {
         columnReader.dataTypeLengthInBits = 
BytesUtils.readIntLittleEndian(bytes,
             (int) columnReader.pageReadStatus.readPosInBytes);
         lengthVarFieldsInCurrentRecord += columnReader.dataTypeLengthInBits;
-
       }
       for (NullableVarLengthColumn columnReader : nullableColumns) {
         if (columnReader.pageReadStatus.currentPage == null
@@ -162,10 +161,6 @@ public class VarLenBinaryReader {
         columnReader.pageReadStatus.valuesRead++;
         columnReader.valuesReadInCurrentPass++;
         currVec.getMutator().setValueCount((int)recordsReadInCurrentPass);
-        // reached the end of a page
-        if ( columnReader.pageReadStatus.valuesRead == 
columnReader.pageReadStatus.currentPage.getValueCount()) {
-          columnReader.pageReadStatus.next();
-        }
       }
       for (NullableVarLengthColumn columnReader : nullableColumns) {
         bytes = columnReader.pageReadStatus.pageDataByteArray;

http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/60e2080f/exec/java-exec/src/test/java/org/apache/drill/exec/store/ParquetRecordReaderTest.java
----------------------------------------------------------------------
diff --git 
a/exec/java-exec/src/test/java/org/apache/drill/exec/store/ParquetRecordReaderTest.java
 
b/exec/java-exec/src/test/java/org/apache/drill/exec/store/ParquetRecordReaderTest.java
index cf790ac..93f1f73 100644
--- 
a/exec/java-exec/src/test/java/org/apache/drill/exec/store/ParquetRecordReaderTest.java
+++ 
b/exec/java-exec/src/test/java/org/apache/drill/exec/store/ParquetRecordReaderTest.java
@@ -40,7 +40,6 @@ import org.apache.drill.exec.server.Drillbit;
 import org.apache.drill.exec.server.RemoteServiceSet;
 
 import org.apache.drill.exec.store.json.JsonSchemaProvider;
-import org.apache.drill.exec.store.parquet.ParquetStorageEngine;
 import org.apache.drill.exec.vector.BaseDataValueVector;
 import org.apache.drill.exec.vector.ValueVector;
 import org.apache.hadoop.conf.Configuration;
@@ -93,6 +92,29 @@ public class ParquetRecordReaderTest {
     props.fields.put("bin2", new FieldInfo("binary", "bin2", -1, bin2Vals, 
TypeProtos.MinorType.VARBINARY, props));
   }
 
+  private void populatePigTPCHCustomerFields(ParquetTestProperties props){
+    // all of the data in the fieldInfo constructors doesn't matter because 
the file is generated outside the test
+    props.fields.put("C_CUSTKEY", new FieldInfo("int32", "integer", 32, 
intVals, TypeProtos.MinorType.INT, props));
+    props.fields.put("C_NATIONKEY", new FieldInfo("int64", "bigInt", 64, 
longVals, TypeProtos.MinorType.BIGINT, props));
+    props.fields.put("C_ACCTBAL", new FieldInfo("float", "f", 32, floatVals, 
TypeProtos.MinorType.FLOAT4, props));
+    props.fields.put("C_NAME", new FieldInfo("double", "d", 64, doubleVals, 
TypeProtos.MinorType.FLOAT8, props));
+    props.fields.put("C_ADDRESS", new FieldInfo("boolean", "b", 1, boolVals, 
TypeProtos.MinorType.BIT, props));
+    props.fields.put("C_PHONE", new FieldInfo("binary", "bin", -1, binVals, 
TypeProtos.MinorType.VARBINARY, props));
+    props.fields.put("C_MKTSEGMENT", new FieldInfo("binary", "bin2", -1, 
bin2Vals, TypeProtos.MinorType.VARBINARY, props));
+    props.fields.put("C_COMMENT", new FieldInfo("binary", "bin2", -1, 
bin2Vals, TypeProtos.MinorType.VARBINARY, props));
+  }
+
+  private void populatePigTPCHSupplierFields(ParquetTestProperties props){
+    // all of the data in the fieldInfo constructors doesn't matter because 
the file is generated outside the test
+    props.fields.put("S_SUPPKEY", new FieldInfo("int32", "integer", 32, 
intVals, TypeProtos.MinorType.INT, props));
+    props.fields.put("S_NATIONKEY", new FieldInfo("int64", "bigInt", 64, 
longVals, TypeProtos.MinorType.BIGINT, props));
+    props.fields.put("S_ACCTBAL", new FieldInfo("float", "f", 32, floatVals, 
TypeProtos.MinorType.FLOAT4, props));
+    props.fields.put("S_NAME", new FieldInfo("double", "d", 64, doubleVals, 
TypeProtos.MinorType.FLOAT8, props));
+    props.fields.put("S_ADDRESS", new FieldInfo("boolean", "b", 1, boolVals, 
TypeProtos.MinorType.BIT, props));
+    props.fields.put("S_PHONE", new FieldInfo("binary", "bin", -1, binVals, 
TypeProtos.MinorType.VARBINARY, props));
+    props.fields.put("S_COMMENT", new FieldInfo("binary", "bin2", -1, 
bin2Vals, TypeProtos.MinorType.VARBINARY, props));
+  }
+
   @Test
   public void testMultipleRowGroups() throws Exception {
     HashMap<String, FieldInfo> fields = new HashMap<>();
@@ -144,6 +166,26 @@ public class ParquetRecordReaderTest {
         "/tmp/test.parquet", i, props);
   }
 
+  // requires binary file generated by pig from TPCH data, also have to 
disable assert where data is coming in
+  @Ignore
+  @Test
+  public void testMultipleRowGroupsAndReadsPigError() throws Exception {
+    HashMap<String, FieldInfo> fields = new HashMap<>();
+    ParquetTestProperties props = new ParquetTestProperties(4, 3000, 
DEFAULT_BYTES_PER_PAGE, fields);
+    populatePigTPCHCustomerFields(props);
+//    populatePigTPCHSupplierFields(props);
+    String readEntries = "";
+    // number of times to read the file
+    int i = 1;
+    for (int j = 0; j < i; j++){
+      readEntries += "{path: \"/tmp/tpc-h/customer\"}";
+      if (j < i - 1)
+        readEntries += ",";
+    }
+    testParquetFullEngineEventBased(false, 
"/parquet_scan_screen_read_entry_replace.json", readEntries,
+        "/tmp/test.parquet", i, props);
+  }
+
   @Test
   public void testMultipleRowGroupsEvent() throws Exception {
     HashMap<String, FieldInfo> fields = new HashMap<>();
@@ -441,6 +483,11 @@ public class ParquetRecordReaderTest {
         client.runQuery(UserProtos.QueryType.LOGICAL, 
Files.toString(FileUtils.getResourceAsFile(plan), Charsets.UTF_8), 
resultListener);
       }
       resultListener.getResults();
+      for (String s : resultListener.valuesChecked.keySet()) {
+        assertEquals("Record count incorrect for column: " + s,
+            props.recordsPerRowGroup * props.numberRowGroups * 
numberOfTimesRead, (long) resultListener.valuesChecked.get(s));
+        logger.debug("Column {}, Values read:{}", s, 
resultListener.valuesChecked.get(s));
+      }
       long D = System.nanoTime();
       System.out.println(String.format("Took %f s to run query", (float)(D-C) 
/ 1E9));
     }

Reply via email to the mailing list.