DRILL-1704: Use the complex reader for dictionary-encoded files, as the original reader seems to be broken
Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/b37dc08a Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/b37dc08a Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/b37dc08a Branch: refs/heads/master Commit: b37dc08a46dea1b1c0bfeba21ea73dd6ac0116bd Parents: 116f6d1 Author: Jason Altekruse <altekruseja...@gmail.com> Authored: Wed Nov 12 18:10:32 2014 -0800 Committer: Jacques Nadeau <jacq...@apache.org> Committed: Thu Nov 13 09:17:36 2014 -0800 ---------------------------------------------------------------------- .../exec/store/parquet/ParquetScanBatchCreator.java | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/b37dc08a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetScanBatchCreator.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetScanBatchCreator.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetScanBatchCreator.java index 53a6ffc..4467825 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetScanBatchCreator.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetScanBatchCreator.java @@ -41,7 +41,9 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import parquet.column.ColumnDescriptor; +import parquet.column.Encoding; import parquet.hadoop.ParquetFileReader; +import parquet.hadoop.metadata.ColumnChunkMetaData; import parquet.hadoop.metadata.ParquetMetadata; import parquet.schema.MessageType; import parquet.schema.Type; @@ -107,7 +109,7 @@ public class ParquetScanBatchCreator implements BatchCreator<ParquetRowGroupScan footers.put(e.getPath(), 
ParquetFileReader.readFooter( fs.getConf(), new Path(e.getPath()))); } - if (!context.getOptions().getOption(ExecConstants.PARQUET_NEW_RECORD_READER).bool_val && !isComplex(footers.get(e.getPath()))) { + if (!context.getOptions().getOption(ExecConstants.PARQUET_NEW_RECORD_READER).bool_val && !isComplex(footers.get(e.getPath()), e.getRowGroupIndex())) { readers.add( new ParquetRecordReader( context, e.getPath(), e.getRowGroupIndex(), fs, @@ -153,7 +155,12 @@ public class ParquetScanBatchCreator implements BatchCreator<ParquetRowGroupScan return s; } - private static boolean isComplex(ParquetMetadata footer) { + private static boolean isComplex(ParquetMetadata footer, int rowGroupIndex) { + for (ColumnChunkMetaData md : footer.getBlocks().get(rowGroupIndex).getColumns()) { + if (md.getEncodings().contains(Encoding.PLAIN_DICTIONARY)) { + return true; // for now, use Complex reader for Dictionary encoded + } + } MessageType schema = footer.getFileMetaData().getSchema(); for (Type type : schema.getFields()) { @@ -161,6 +168,7 @@ public class ParquetScanBatchCreator implements BatchCreator<ParquetRowGroupScan return true; } } + ColumnDescriptor desc; for (ColumnDescriptor col : schema.getColumns()) { if (col.getMaxRepetitionLevel() > 0) { return true;