Repository: incubator-drill
Updated Branches:
  refs/heads/master fd54e71e8 -> 108d29fce
DRILL-1701: Fix for nullable dictionary columns in optimized parquet reader. Re-enable the optimized reader for dictionary encoded files.


Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/ade74b18
Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/ade74b18
Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/ade74b18

Branch: refs/heads/master
Commit: ade74b18dec5645cb8ce1fb4d0aa9ec40839895a
Parents: fd54e71
Author: Jason Altekruse <altekruseja...@gmail.com>
Authored: Thu Nov 13 14:02:51 2014 -0800
Committer: Jason Altekruse <altekruseja...@gmail.com>
Committed: Thu Nov 13 14:52:55 2014 -0800

----------------------------------------------------------------------
 .../exec/store/parquet/ParquetScanBatchCreator.java   | 12 ++----------
 .../columnreaders/NullableVarLengthValuesColumn.java  | 10 +++++-----
 2 files changed, 7 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/ade74b18/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetScanBatchCreator.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetScanBatchCreator.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetScanBatchCreator.java
index 4467825..53a6ffc 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetScanBatchCreator.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetScanBatchCreator.java
@@ -41,9 +41,7 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 
 import parquet.column.ColumnDescriptor;
-import parquet.column.Encoding;
 import parquet.hadoop.ParquetFileReader;
-import parquet.hadoop.metadata.ColumnChunkMetaData;
 import parquet.hadoop.metadata.ParquetMetadata;
 import parquet.schema.MessageType;
 import parquet.schema.Type;
@@ -109,7 +107,7 @@ public class ParquetScanBatchCreator implements BatchCreator<ParquetRowGroupScan
             footers.put(e.getPath(), ParquetFileReader.readFooter(
                 fs.getConf(), new Path(e.getPath())));
           }
-          if (!context.getOptions().getOption(ExecConstants.PARQUET_NEW_RECORD_READER).bool_val && !isComplex(footers.get(e.getPath()), e.getRowGroupIndex())) {
+          if (!context.getOptions().getOption(ExecConstants.PARQUET_NEW_RECORD_READER).bool_val && !isComplex(footers.get(e.getPath()))) {
             readers.add(
                 new ParquetRecordReader(
                     context, e.getPath(), e.getRowGroupIndex(), fs,
@@ -155,12 +153,7 @@ public class ParquetScanBatchCreator implements BatchCreator<ParquetRowGroupScan
     return s;
   }
 
-  private static boolean isComplex(ParquetMetadata footer, int rowGroupIndex) {
-    for (ColumnChunkMetaData md : footer.getBlocks().get(rowGroupIndex).getColumns()) {
-      if (md.getEncodings().contains(Encoding.PLAIN_DICTIONARY)) {
-        return true; // for now, use Complex reader for Dictionary encoded
-      }
-    }
+  private static boolean isComplex(ParquetMetadata footer) {
     MessageType schema = footer.getFileMetaData().getSchema();
 
     for (Type type : schema.getFields()) {
@@ -168,7 +161,6 @@ public class ParquetScanBatchCreator implements BatchCreator<ParquetRowGroupScan
         return true;
       }
     }
-    ColumnDescriptor desc;
     for (ColumnDescriptor col : schema.getColumns()) {
       if (col.getMaxRepetitionLevel() > 0) {
         return true;
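In short, column encodings no longer factor into reader selection: isComplex() now inspects only the schema shape, so dictionary-encoded flat files go back to the optimized ParquetRecordReader and only nested or repeated schemas fall through to the complex reader. Below is a minimal standalone sketch of the resulting check, reconstructed from the hunks above; the loop body elided between the last two hunks is assumed here to be a !type.isPrimitive() test, and the wrapper class is only for illustration:

import parquet.column.ColumnDescriptor;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.schema.MessageType;
import parquet.schema.Type;

class ReaderSelectionSketch {
  // Encoding checks are gone: only schema structure can force the
  // complex reader now.
  static boolean isComplex(ParquetMetadata footer) {
    MessageType schema = footer.getFileMetaData().getSchema();
    for (Type type : schema.getFields()) {
      if (!type.isPrimitive()) {  // assumed loop body: any nested (group) field
        return true;
      }
    }
    for (ColumnDescriptor col : schema.getColumns()) {
      if (col.getMaxRepetitionLevel() > 0) {  // any repeated column
        return true;
      }
    }
    return false;
  }
}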
http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/ade74b18/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/NullableVarLengthValuesColumn.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/NullableVarLengthValuesColumn.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/NullableVarLengthValuesColumn.java
index 2e24674..aa3d9c5 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/NullableVarLengthValuesColumn.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/NullableVarLengthValuesColumn.java
@@ -122,14 +122,14 @@ public abstract class NullableVarLengthValuesColumn<V extends ValueVector> exten
   protected void readField(long recordsToRead) {
     // TODO - unlike most implementations of this method, the recordsReadInThisIteration field is not set here
     //        should verify that this is not breaking anything
-    if (usingDictionary) {
-      currDictValToWrite = pageReader.dictionaryValueReader.readBytes();
-      // re-purposing this field here for length in BYTES to prevent repetitive multiplication/division
-    }
-    dataTypeLengthInBits = variableWidthVector.getAccessor().getValueLength(valuesReadInCurrentPass);
     currentValNull = variableWidthVector.getAccessor().getObject(valuesReadInCurrentPass) == null;
     // again, I am re-purposing the unused field here, it is a length in BYTES, not bits
     if (! currentValNull) {
+      if (usingDictionary) {
+        currDictValToWrite = pageReader.dictionaryValueReader.readBytes();
+      }
+      // re-purposing this field here for length in BYTES to prevent repetitive multiplication/division
+      dataTypeLengthInBits = variableWidthVector.getAccessor().getValueLength(valuesReadInCurrentPass);
      boolean success = setSafe(valuesReadInCurrentPass, pageReader.pageDataByteArray,
          (int) pageReader.readPosInBytes + 4, dataTypeLengthInBits);
      assert success;
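The root cause addressed by this hunk: for a nullable dictionary-encoded column, the dictionary value stream carries entries only for the non-null records, while the old code called dictionaryValueReader.readBytes() once per record. Every null row therefore consumed a dictionary entry belonging to a later non-null row, skewing all subsequent values. Moving the read (together with the length lookup) inside the !currentValNull branch keeps the stream aligned. A toy illustration of that invariant, independent of the Drill classes (all names below are hypothetical):

import java.util.Arrays;
import java.util.Iterator;

class NullableDictionaryDecodeSketch {
  // Dictionary-encoded, nullable column: the value stream holds one entry
  // per NON-null record. Reading it on null records drifts out of sync.
  static String[] decode(boolean[] isNull, Iterator<String> dictValues) {
    String[] out = new String[isNull.length];
    for (int i = 0; i < isNull.length; i++) {
      if (!isNull[i]) {
        out[i] = dictValues.next(); // consume only for non-null records
      }                             // nulls take nothing from the stream
    }
    return out;
  }

  public static void main(String[] args) {
    boolean[] isNull = { false, true, false };                  // row 1 is null
    Iterator<String> dict = Arrays.asList("a", "b").iterator(); // two entries
    System.out.println(Arrays.toString(decode(isNull, dict)));  // [a, null, b]
  }
}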