DRILL-4048: Fix reading required dictionary encoded varbinary data in parquet files after recent update
The fix itself was small; this update is a little larger than necessary
because I was hoping to create a unit test by modifying the one I had added
in the earlier patch with the version upgrade. Unfortunately we don't have a
good way to generate Parquet files with required columns from unit tests
right now, so I just added a smaller subset of the binary file that was
posted on the JIRA issue. The refactoring of the earlier test was still
useful for readability, so I kept it in.

Project: http://git-wip-us.apache.org/repos/asf/drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/a5a1aa6b
Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/a5a1aa6b
Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/a5a1aa6b

Branch: refs/heads/master
Commit: a5a1aa6b487d0642641ce70f2c2500465956f9ec
Parents: ffe0240
Author: Jason Altekruse <[email protected]>
Authored: Fri Nov 6 19:24:28 2015 -0800
Committer: Jacques Nadeau <[email protected]>
Committed: Fri Nov 6 21:41:07 2015 -0800

----------------------------------------------------------------------
 .../columnreaders/VarLengthColumnReaders.java   |  3 +-
 .../physical/impl/writer/TestParquetWriter.java | 71 ++++++++++++++-----
 .../parquet/required_dictionary.parquet         | Bin 0 -> 8020 bytes
 3 files changed, 54 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/drill/blob/a5a1aa6b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLengthColumnReaders.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLengthColumnReaders.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLengthColumnReaders.java
index ba126d2..39cad0e 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLengthColumnReaders.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLengthColumnReaders.java
@@ -187,7 +187,8 @@ public class VarLengthColumnReaders {
 
       if (usingDictionary) {
         currDictValToWrite = pageReader.dictionaryValueReader.readBytes();
-        mutator.setSafe(index, currDictValToWrite.toByteBuffer(), 0, currDictValToWrite.length());
+        ByteBuffer buf = currDictValToWrite.toByteBuffer();
+        mutator.setSafe(index, buf, buf.position(), currDictValToWrite.length());
       } else {
         mutator.setSafe(index, start, start + length, bytebuf);
       }
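The one-line fix above is worth a second look: Binary.toByteBuffer() can hand
back a view into a shared backing buffer (for dictionary values, a slice of
the dictionary page), whose position() is not necessarily 0. Reading from
absolute offset 0 then picks up a neighboring value's bytes. A minimal
standalone sketch of the pitfall, plain java.nio only, no Drill code:

    import java.nio.ByteBuffer;
    import java.nio.charset.StandardCharsets;

    public class PositionPitfall {
      public static void main(String[] args) {
        // Several values packed into one backing array, the way a
        // dictionary page stores them.
        byte[] page = "aaaabbbbcccc".getBytes(StandardCharsets.UTF_8);

        // A view over the middle value: position = 4, limit = 8.
        ByteBuffer value = ByteBuffer.wrap(page, 4, 4);

        byte[] wrong = new byte[4];
        byte[] right = new byte[4];
        for (int i = 0; i < 4; i++) {
          wrong[i] = value.get(i);                    // absolute offset 0: "aaaa"
          right[i] = value.get(value.position() + i); // honors position: "bbbb"
        }
        System.out.println(new String(wrong, StandardCharsets.UTF_8)); // aaaa
        System.out.println(new String(right, StandardCharsets.UTF_8)); // bbbb
      }
    }

The patched reader does the value.position() variant; the removed line was
the absolute-offset one.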
http://git-wip-us.apache.org/repos/asf/drill/blob/a5a1aa6b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
index 51d5d08..007561d 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
@@ -21,7 +21,12 @@ import java.io.File;
 import java.io.FileWriter;
 import java.math.BigDecimal;
 import java.sql.Date;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
 
+import com.google.common.base.Joiner;
 import org.apache.drill.BaseTestQuery;
 import org.apache.drill.exec.ExecConstants;
 import org.apache.drill.exec.fn.interp.TestConstantFolding;
@@ -45,25 +50,46 @@ public class TestParquetWriter extends BaseTestQuery {
   public TemporaryFolder folder = new TemporaryFolder();
   static FileSystem fs;
 
-  private String allTypesSelection =
-      "cast( `int_col` AS int) `int_col`, " +
-      "cast( `bigint_col` AS bigint) `bigint_col`, " +
-      // TODO(DRILL-3367)
-//      "cast( `decimal9_col` AS decimal(9, 4)) `decimal9_col`, " +
-//      "cast( `decimal18_col` AS decimal(18,9)) `decimal18_col`, " +
-//      "cast( `decimal28sparse_col` AS decimal(28, 14)) `decimal28sparse_col`, " +
-//      "cast( `decimal38sparse_col` AS decimal(38, 19)) `decimal38sparse_col`, " +
-      "cast( `date_col` AS date) `date_col`, " +
-      "cast( `timestamp_col` AS timestamp) `timestamp_col`, " +
-      "cast( `float4_col` AS float) `float4_col`, " +
-      "cast( `float8_col` AS double) `float8_col`, " +
-      "cast( `varbinary_col` AS varbinary(65000)) `varbinary_col`, " +
-      // TODO(DRILL-2297)
-//      "cast( `intervalyear_col` AS interval year) `intervalyear_col`, " +
-      "cast( `intervalday_col` AS interval day) `intervalday_col`, " +
-      "cast( `bit_col` AS boolean) `bit_col`, " +
-      " `varchar_col` `varchar_col`, " +
-      "cast( `time_col` AS time) `time_col` ";
+  // Map storing a convenient name as well as the cast type necessary
+  // to produce it casting from a varchar
+  private static final Map<String, String> allTypes = new HashMap<>();
+
+  // Select statement for all supported Drill types, for use in conjunction with
+  // the file parquet/alltypes.json in the resources directory
+  private static final String allTypesSelection;
+
+  static {
+    allTypes.put("int", "int");
+    allTypes.put("bigint", "bigint");
+    // TODO(DRILL-3367)
+//    allTypes.put("decimal(9, 4)", "decimal9");
+//    allTypes.put("decimal(18,9)", "decimal18");
+//    allTypes.put("decimal(28, 14)", "decimal28sparse");
+//    allTypes.put("decimal(38, 19)", "decimal38sparse");
+    allTypes.put("date", "date");
+    allTypes.put("timestamp", "timestamp");
+    allTypes.put("float", "float4");
+    allTypes.put("double", "float8");
+    allTypes.put("varbinary(65000)", "varbinary");
+    // TODO(DRILL-2297)
+//    allTypes.put("interval year", "intervalyear");
+    allTypes.put("interval day", "intervalday");
+    allTypes.put("boolean", "bit");
+    allTypes.put("varchar", "varchar");
+    allTypes.put("time", "time");
+
+    List<String> allTypeSelectsAndCasts = new ArrayList<>();
+    for (String s : allTypes.keySet()) {
+      // don't need to cast a varchar, just add the column reference
+      if (s.equals("varchar")) {
+        allTypeSelectsAndCasts.add(String.format("`%s_col`", allTypes.get(s)));
+        continue;
+      }
+      allTypeSelectsAndCasts.add(String.format("cast(`%s_col` AS %S) `%s_col`", allTypes.get(s), s, allTypes.get(s)));
+    }
+    allTypesSelection = Joiner.on(",").join(allTypeSelectsAndCasts);
+  }
+
   private String allTypesTable = "cp.`/parquet/alltypes.json`";
@@ -158,6 +184,12 @@ public class TestParquetWriter extends BaseTestQuery {
   }
 
   @Test
+  public void testDictionaryError() throws Exception {
+    compareParquetReadersColumnar("*", "cp.`parquet/required_dictionary.parquet`");
+    runTestAndValidate("*", "*", "cp.`parquet/required_dictionary.parquet`", "required_dictionary");
+  }
+
+  @Test
   public void testDictionaryEncoding() throws Exception {
     String selection = "type";
     String inputTable = "cp.`donuts.json`";
@@ -668,6 +700,7 @@ public class TestParquetWriter extends BaseTestQuery {
   public void runTestAndValidate(String selection, String validationSelection, String inputTable, String outputFile) throws Exception {
     try {
+      deleteTableIfExists(outputFile);
       test("use dfs_test.tmp");
 //    test("ALTER SESSION SET `planner.add_producer_consumer` = false");
       String query = String.format("SELECT %s FROM %s", selection, inputTable);


http://git-wip-us.apache.org/repos/asf/drill/blob/a5a1aa6b/exec/java-exec/src/test/resources/parquet/required_dictionary.parquet
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/test/resources/parquet/required_dictionary.parquet b/exec/java-exec/src/test/resources/parquet/required_dictionary.parquet
new file mode 100644
index 0000000..c837192
Binary files /dev/null and b/exec/java-exec/src/test/resources/parquet/required_dictionary.parquet differ
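On the test side, the refactor above swaps a hand-maintained SELECT string
for a map from SQL cast type to column-name stem, joined into one selection
in a static block. A trimmed standalone sketch of the same construction,
three types instead of the full set, with String.join standing in for
Guava's Joiner (class and variable names here are illustrative only):

    import java.util.ArrayList;
    import java.util.LinkedHashMap;
    import java.util.List;
    import java.util.Map;

    public class SelectionBuilderSketch {
      public static void main(String[] args) {
        // Cast type -> column-name stem. LinkedHashMap only so the printed
        // order is deterministic; the test uses a plain HashMap, where
        // iteration order does not matter.
        Map<String, String> types = new LinkedHashMap<>();
        types.put("int", "int");
        types.put("varchar", "varchar");
        types.put("varbinary(65000)", "varbinary");

        List<String> casts = new ArrayList<>();
        for (String sqlType : types.keySet()) {
          String col = types.get(sqlType);
          if (sqlType.equals("varchar")) {
            casts.add(String.format("`%s_col`", col)); // varchar needs no cast
            continue;
          }
          // %S upper-cases the cast type, matching the test's formatting
          casts.add(String.format("cast(`%s_col` AS %S) `%s_col`", col, sqlType, col));
        }
        System.out.println(String.join(",", casts));
        // -> cast(`int_col` AS INT) `int_col`,`varchar_col`,
        //    cast(`varbinary_col` AS VARBINARY(65000)) `varbinary_col`
      }
    }

The upper-casing from %S is purely cosmetic in SQL, and the commented-out
decimal and interval-year entries carry over unchanged, so re-enabling a
type once DRILL-3367 or DRILL-2297 lands is a one-line edit.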

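Finally, the deleteTableIfExists(outputFile) call added to runTestAndValidate
makes the CTAS round-trip idempotent: a table left behind by an earlier
failed run no longer breaks a re-run. The helper already exists in
TestParquetWriter; a hypothetical sketch of its likely shape, where only
FileSystem.exists and FileSystem.delete are standard Hadoop API and the
workspace root and signature are assumptions:

    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class DeleteHelperSketch {
      static FileSystem fs; // the shared test FileSystem, as in TestParquetWriter

      static boolean deleteTableIfExists(String tableName) {
        try {
          // Resolve the table under the dfs_test.tmp workspace root
          // (the root path here is an assumption).
          Path path = new Path("/tmp/drilltest", tableName);
          if (fs.exists(path)) {
            return fs.delete(path, true); // recursive: CTAS output is a directory
          }
        } catch (Exception e) {
          // swallow: best-effort cleanup before the test runs
        }
        return true;
      }
    }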