[ 
https://issues.apache.org/jira/browse/DRILL-6685?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16586554#comment-16586554
 ] 

ASF GitHub Bot commented on DRILL-6685:
---------------------------------------

Ben-Zvi closed pull request #1433: DRILL-6685: Fixed exception when reading 
Parquet data
URL: https://github.com/apache/drill/pull/1433
 
 
   

This is a PR merged from a forked repository.
GitHub does not display the original diff of a foreign (fork) pull
request once it has been merged, so the diff is reproduced below for
the sake of provenance:

diff --git 
a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLenAbstractPageEntryReader.java
 
b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLenAbstractPageEntryReader.java
index fecf1ce3158..a708f52353f 100644
--- 
a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLenAbstractPageEntryReader.java
+++ 
b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLenAbstractPageEntryReader.java
@@ -97,4 +97,22 @@ protected final boolean load(boolean force) {
   protected final int remainingPageData() {
     return pageInfo.pageDataLen - pageInfo.pageDataOff;
   }
+
+  /**
+   * Fixed length readers calculate upfront the maximum number of entries to 
process as entry length
+   * are known.
+   * @param valuesToRead requested number of values to read
+   * @param entrySz sizeof(integer) + column's precision
+   * @return maximum entries to read within each call (based on the bulk 
entry, entry size, and requested
+   *         number of entries to read)
+   */
+  protected final int getFixedLengthMaxRecordsToRead(int valuesToRead, int 
entrySz) {
+    // Let's start with bulk's entry and requested values-to-read constraints
+    int numEntriesToRead = Math.min(entry.getMaxEntries(), valuesToRead);
+
+    // Now include the size of the fixed entry (since they are fixed)
+    numEntriesToRead = Math.min(numEntriesToRead, buffer.limit() / entrySz);
+
+    return numEntriesToRead;
+  }
 }
diff --git 
a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLenFixedEntryReader.java
 
b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLenFixedEntryReader.java
index a6e7077241a..e66bd051163 100644
--- 
a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLenFixedEntryReader.java
+++ 
b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLenFixedEntryReader.java
@@ -43,7 +43,7 @@ final VarLenColumnBulkEntry getEntry(int valuesToRead) {
 
     final int expectedDataLen = columnPrecInfo.precision;
     final int entrySz = 4 + columnPrecInfo.precision;
-    final int readBatch = Math.min(entry.getMaxEntries(), valuesToRead);
+    final int readBatch = getFixedLengthMaxRecordsToRead(valuesToRead, 
entrySz);
     Preconditions.checkState(readBatch > 0, "Read batch count [%d] should be 
greater than zero", readBatch);
 
     final int[] valueLengths = entry.getValuesLength();
diff --git 
a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLenNullableFixedEntryReader.java
 
b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLenNullableFixedEntryReader.java
index 3869113249b..caf5c73472b 100644
--- 
a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLenNullableFixedEntryReader.java
+++ 
b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLenNullableFixedEntryReader.java
@@ -45,7 +45,7 @@ final VarLenColumnBulkEntry getEntry(int valuesToRead) {
 
     final int expectedDataLen = columnPrecInfo.precision;
     final int entrySz = 4 + columnPrecInfo.precision;
-    final int readBatch = Math.min(entry.getMaxEntries(), valuesToRead);
+    final int readBatch = getFixedLengthMaxRecordsToRead(valuesToRead, 
entrySz);
     Preconditions.checkState(readBatch > 0, "Read batch count [%s] should be 
greater than zero", readBatch);
 
     final int[] valueLengths = entry.getValuesLength();
diff --git 
a/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetBulkReader.java
 
b/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetBulkReader.java
new file mode 100644
index 00000000000..315ff93be4c
--- /dev/null
+++ 
b/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetBulkReader.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.store.parquet;
+
+import org.apache.drill.test.ClusterFixture;
+import org.apache.drill.test.ClusterFixtureBuilder;
+import org.apache.drill.test.ClusterTest;
+import org.apache.drill.exec.ExecConstants;
+import org.apache.drill.exec.planner.physical.PlannerSettings;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/** Tests the Parquet bulk reader */
+public class TestParquetBulkReader extends ClusterTest {
+
+
+  @BeforeClass
+  public static void setup() throws Exception {
+    ClusterFixtureBuilder builder = ClusterFixture.builder(dirTestWatcher);
+    startCluster(builder);
+  }
+
+  private static final String DATAFILE = 
"cp.`parquet/fourvarchar_asc_nulls.parquet`";
+
+  /** Load variable length data which has nulls */
+  @Test
+  public void testNullCount() throws Exception {
+    try {
+      alterSession();
+
+      testBuilder()
+        .sqlQuery("select count(*) as c from %s where VarbinaryValue3 is 
null", DATAFILE)
+        .unOrdered()
+        .baselineColumns("c")
+        .baselineValues(71L)
+        .go();
+
+      testBuilder()
+        .sqlQuery("select count(*) as c from %s where VarbinaryValue1 is 
null", DATAFILE)
+        .unOrdered()
+        .baselineColumns("c")
+        .baselineValues(44L)
+        .go();
+    } finally {
+      resetSession();
+    }
+  }
+
+  /** Load variable length data which has non-nulls data */
+  @Test
+  public void testNotNullCount() throws Exception {
+    try {
+      alterSession();
+
+      testBuilder()
+        .sqlQuery("select count(*) as c from %s where VarbinaryValue3 is not 
null", DATAFILE)
+        .unOrdered()
+        .baselineColumns("c")
+        .baselineValues(0L)
+        .go();
+
+      testBuilder()
+        .sqlQuery("select count(*) as c from %s where VarbinaryValue1 is not 
null", DATAFILE)
+        .unOrdered()
+        .baselineColumns("c")
+        .baselineValues(27L)
+        .go();
+    } finally {
+      resetSession();
+    }
+  }
+
+  /** Load variable columns with fixed length data with large precision and 
null values */
+  @Test
+  public void testFixedLengthWithLargePrecisionAndNulls() throws Exception {
+    try {
+      alterSession();
+
+      testBuilder()
+        .sqlQuery("select count(*) as c from %s where index < 50 and 
length(VarbinaryValue1) = 400", DATAFILE)
+        .unOrdered()
+        .baselineColumns("c")
+        .baselineValues(25L)
+        .go();
+    } finally {
+      resetSession();
+    }
+  }
+
+  /** Load variable length data which was originally fixed length and then 
became variable length */
+  @Test
+  public void testFixedLengthToVarlen() throws Exception {
+    try {
+      alterSession();
+
+      testBuilder()
+        .sqlQuery("select count(*) as c from %s where index < 60 and 
length(VarbinaryValue1) <= 800", DATAFILE)
+        .unOrdered()
+        .baselineColumns("c")
+        .baselineValues(27L)
+        .go();
+    } finally {
+      resetSession();
+    }
+  }
+
+  /** Load variable length data with values larger than chunk size (4k) */
+  @Test
+  public void testLargeVarlen() throws Exception {
+    try {
+      alterSession();
+
+      testBuilder()
+        .sqlQuery("select count(*) as c from %s where length(VarbinaryValue2) 
= 4500", DATAFILE)
+        .unOrdered()
+        .baselineColumns("c")
+        .baselineValues(19L)
+        .go();
+    } finally {
+      resetSession();
+    }
+  }
+
+  private void alterSession() {
+    client.alterSession(ExecConstants.PARQUET_FLAT_READER_BULK, true);
+    
client.alterSession(PlannerSettings.PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_KEY,
 false);
+  }
+
+  private void resetSession() {
+    client.resetSession(ExecConstants.PARQUET_FLAT_READER_BULK);
+    
client.resetSession(PlannerSettings.PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_KEY);
+  }
+
+}
diff --git 
a/exec/java-exec/src/test/resources/parquet/fourvarchar_asc_nulls.parquet 
b/exec/java-exec/src/test/resources/parquet/fourvarchar_asc_nulls.parquet
new file mode 100644
index 00000000000..abe4aafc595
Binary files /dev/null and 
b/exec/java-exec/src/test/resources/parquet/fourvarchar_asc_nulls.parquet differ


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Error in parquet record reader
> ------------------------------
>
>                 Key: DRILL-6685
>                 URL: https://issues.apache.org/jira/browse/DRILL-6685
>             Project: Apache Drill
>          Issue Type: Bug
>          Components: Storage - Parquet
>    Affects Versions: 1.14.0
>            Reporter: Robert Hou
>            Assignee: salim achouche
>            Priority: Major
>              Labels: ready-to-commit
>             Fix For: 1.15.0
>
>         Attachments: drillbit.log.6685
>
>
> This is the query:
> select VarbinaryValue1 from 
> dfs.`/drill/testdata/batch_memory/fourvarchar_asc_nulls_16MB.parquet` limit 
> 36;
> It appears to be caused by this commit:
> DRILL-6570: Fixed IndexOutofBoundException in Parquet Reader
> aee899c1b26ebb9a5781d280d5a73b42c273d4d5
> This is the stack trace:
> {noformat}
> Error: INTERNAL_ERROR ERROR: Error in parquet record reader.
> Message: 
> Hadoop path: 
> /drill/testdata/batch_memory/fourvarchar_asc_nulls_16MB.parquet/0_0_0.parquet
> Total records read: 0
> Row group index: 0
> Records in row group: 1250
> Parquet Metadata: ParquetMetaData{FileMetaData{schema: message root {
>   optional int64 Index;
>   optional binary VarbinaryValue1;
>   optional int64 BigIntValue;
>   optional boolean BooleanValue;
>   optional int32 DateValue (DATE);
>   optional float FloatValue;
>   optional binary VarcharValue1 (UTF8);
>   optional double DoubleValue;
>   optional int32 IntegerValue;
>   optional int32 TimeValue (TIME_MILLIS);
>   optional int64 TimestampValue (TIMESTAMP_MILLIS);
>   optional binary VarbinaryValue2;
>   optional fixed_len_byte_array(12) IntervalYearValue (INTERVAL);
>   optional fixed_len_byte_array(12) IntervalDayValue (INTERVAL);
>   optional fixed_len_byte_array(12) IntervalSecondValue (INTERVAL);
>   optional binary VarcharValue2 (UTF8);
> }
> , metadata: {drill-writer.version=2, drill.version=1.14.0-SNAPSHOT}}, blocks: 
> [BlockMetaData{1250, 23750308 [ColumnMetaData{UNCOMPRESSED [Index] optional 
> int64 Index  [PLAIN, RLE, BIT_PACKED], 4}, ColumnMetaData{UNCOMPRESSED 
> [VarbinaryValue1] optional binary VarbinaryValue1  [PLAIN, RLE, BIT_PACKED], 
> 10057}, ColumnMetaData{UNCOMPRESSED [BigIntValue] optional int64 BigIntValue  
> [PLAIN, RLE, BIT_PACKED], 8174655}, ColumnMetaData{UNCOMPRESSED 
> [BooleanValue] optional boolean BooleanValue  [PLAIN, RLE, BIT_PACKED], 
> 8179722}, ColumnMetaData{UNCOMPRESSED [DateValue] optional int32 DateValue 
> (DATE)  [PLAIN, RLE, BIT_PACKED], 8179916}, ColumnMetaData{UNCOMPRESSED 
> [FloatValue] optional float FloatValue  [PLAIN, RLE, BIT_PACKED], 8184959}, 
> ColumnMetaData{UNCOMPRESSED [VarcharValue1] optional binary VarcharValue1 
> (UTF8)  [PLAIN, RLE, BIT_PACKED], 8190002}, ColumnMetaData{UNCOMPRESSED 
> [DoubleValue] optional double DoubleValue  [PLAIN, RLE, BIT_PACKED], 
> 10230058}, ColumnMetaData{UNCOMPRESSED [IntegerValue] optional int32 
> IntegerValue  [PLAIN, RLE, BIT_PACKED], 10240111}, 
> ColumnMetaData{UNCOMPRESSED [TimeValue] optional int32 TimeValue 
> (TIME_MILLIS)  [PLAIN, RLE, BIT_PACKED], 10245154}, 
> ColumnMetaData{UNCOMPRESSED [TimestampValue] optional int64 TimestampValue 
> (TIMESTAMP_MILLIS)  [PLAIN, RLE, BIT_PACKED], 10250197}, 
> ColumnMetaData{UNCOMPRESSED [VarbinaryValue2] optional binary VarbinaryValue2 
>  [PLAIN, RLE, BIT_PACKED], 10260250}, ColumnMetaData{UNCOMPRESSED 
> [IntervalYearValue] optional fixed_len_byte_array(12) IntervalYearValue 
> (INTERVAL)  [PLAIN, RLE, BIT_PACKED], 19632385}, ColumnMetaData{UNCOMPRESSED 
> [IntervalDayValue] optional fixed_len_byte_array(12) IntervalDayValue 
> (INTERVAL)  [PLAIN, RLE, BIT_PACKED], 19647446}, ColumnMetaData{UNCOMPRESSED 
> [IntervalSecondValue] optional fixed_len_byte_array(12) IntervalSecondValue 
> (INTERVAL)  [PLAIN, RLE, BIT_PACKED], 19662507}, ColumnMetaData{UNCOMPRESSED 
> [VarcharValue2] optional binary VarcharValue2 (UTF8)  [PLAIN, RLE, 
> BIT_PACKED], 19677568}]}]}
> Fragment 0:0
> [Error Id: 25852cdb-3217-4041-9743-66e9f3a2fbe4 on qa-node186.qa.lab:31010] 
> (state=,code=0)
> {noformat}
> Table can be found in 10.10.100.186:/tmp/fourvarchar_asc_nulls_16MB.parquet
> sys.version is:
> 1.15.0-SNAPSHOT a05f17d6fcd80f0d21260d3b1074ab895f457bac        Changed 
> PROJECT_OUTPUT_BATCH_SIZE to System + Session   30.07.2018 @ 17:12:53 PDT     
>   [email protected]   30.07.2018 @ 17:25:21 PDT
> fourvarchar_asc_nulls70.q



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to