[ https://issues.apache.org/jira/browse/DRILL-6570?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16535180#comment-16535180 ]
ASF GitHub Bot commented on DRILL-6570: --------------------------------------- Ben-Zvi closed pull request #1354: DRILL-6570: Fixed IndexOutofBoundException in Parquet Reader URL: https://github.com/apache/drill/pull/1354 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLenColumnBulkEntry.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLenColumnBulkEntry.java index bc7741553d..e37700a3f1 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLenColumnBulkEntry.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLenColumnBulkEntry.java @@ -18,7 +18,7 @@ package org.apache.drill.exec.store.parquet.columnreaders; import org.apache.drill.exec.store.parquet.columnreaders.VarLenColumnBulkInput.ColumnPrecisionInfo; -import org.apache.drill.exec.store.parquet.columnreaders.VarLenColumnBulkInput.ColumnPrecisionType; +import org.apache.drill.exec.vector.UInt4Vector; import org.apache.drill.exec.vector.VarLenBulkEntry; import io.netty.buffer.DrillBuf; @@ -55,25 +55,17 @@ } VarLenColumnBulkEntry(ColumnPrecisionInfo columnPrecInfo, int buffSz) { - int lengthSz = -1; - int dataSz = -1; - - if (ColumnPrecisionType.isPrecTypeFixed(columnPrecInfo.columnPrecisionType)) { - final int expectedDataLen = columnPrecInfo.precision; - final int maxNumValues = buffSz / (4 + expectedDataLen); - lengthSz = maxNumValues; - dataSz = maxNumValues * expectedDataLen + PADDING; - - } else { - // For variable length data, we need to handle a) maximum number of entries and b) max entry length - final int smallestDataLen = 1; - final int largestDataLen = buffSz - 4; - final 
int maxNumValues = buffSz / (4 + smallestDataLen); - lengthSz = maxNumValues; - dataSz = largestDataLen + PADDING; - } - - this.lengths = new int[lengthSz]; + + // For variable length data, we need to handle a) maximum number of entries + // and b) max entry length. Note that we don't optimize for fixed length + // columns as the reader can notice a false-positive (that is, the first + // values were fixed but not the rest). + final int largestDataLen = buffSz - UInt4Vector.VALUE_WIDTH; + final int maxNumValues = buffSz / UInt4Vector.VALUE_WIDTH; + final int lengthSz = maxNumValues; + final int dataSz = largestDataLen + PADDING; + + this.lengths = new int[lengthSz]; this.internalArray = new byte[dataSz]; } ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > IndexOutOfBoundsException when using Flat Parquet Reader > --------------------------------------------------------- > > Key: DRILL-6570 > URL: https://issues.apache.org/jira/browse/DRILL-6570 > Project: Apache Drill > Issue Type: Bug > Components: Storage - Parquet > Reporter: salim achouche > Assignee: salim achouche > Priority: Major > Labels: ready-to-commit > Fix For: 1.14.0 > > Original Estimate: 2h > Remaining Estimate: 2h > > * The Parquet Reader creates a reusable bulk entry based on the column > precision > * It uses the column precision for optimizing the intermediary heap buffers > * It first detected that the column was fixed length, but then reverted this > assumption when the column changed precision > * This step was fine, except that the bulk entry's memory requirement changed > while the code did not update the bulk entry's intermediary buffers > -- This message was sent by Atlassian JIRA (v7.6.3#76005)