Repository: hive Updated Branches: refs/heads/master 7942bc6c9 -> 17abdb211
HIVE-18411: Fix ArrayIndexOutOfBoundsException for VectorizedListColumnReader (Colin Ma, reviewed by Ferdinand Xu) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/17abdb21 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/17abdb21 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/17abdb21 Branch: refs/heads/master Commit: 17abdb211c1b2b749fc7d8265d31e6c5987cea4b Parents: 7942bc6 Author: Ferdinand Xu <cheng.a...@intel.com> Authored: Wed Jan 17 15:39:54 2018 +0800 Committer: Ferdinand Xu <cheng.a...@intel.com> Committed: Wed Jan 17 15:39:54 2018 +0800 ---------------------------------------------------------------------- .../vector/VectorizedListColumnReader.java | 5 +++ .../parquet/TestVectorizedListColumnReader.java | 33 ++++++++++++++++++++ 2 files changed, 38 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/17abdb21/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java index 12af77c..04fa129 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java @@ -19,6 +19,7 @@ import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import 
org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; @@ -52,6 +53,10 @@ public class VectorizedListColumnReader extends BaseVectorizedColumnReader { @Override public void readBatch(int total, ColumnVector column, TypeInfo columnType) throws IOException { ListColumnVector lcv = (ListColumnVector) column; + // before readBatch, initialize the size of offsets & lengths as the default value, + // the actual size will be assigned in setChildrenInfo() after reading completes. + lcv.offsets = new long[VectorizedRowBatch.DEFAULT_SIZE]; + lcv.lengths = new long[VectorizedRowBatch.DEFAULT_SIZE]; // Because the length of ListColumnVector.child can't be known now, // the valueList will save all data for ListColumnVector temporary. List<Object> valueList = new ArrayList<>(); http://git-wip-us.apache.org/repos/asf/hive/blob/17abdb21/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java index 8ea5d25..d241fc8 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java @@ -166,6 +166,14 @@ public class TestVectorizedListColumnReader extends VectorizedColumnReaderTestBa removeFile(); } + @Test + public void testVectorizedRowBatchSizeChange() throws Exception { + removeFile(); + writeListData(initWriterFromFile(), false, 1200); + testVectorizedRowBatchSizeChangeListRead(); + removeFile(); + } + private void testListReadAllType(boolean isDictionaryEncoding, int elementNum) throws Exception { testListRead(isDictionaryEncoding, "int", elementNum); testListRead(isDictionaryEncoding, "long", elementNum); @@ -337,4 
+345,29 @@ public class TestVectorizedListColumnReader extends VectorizedColumnReaderTestBa reader.close(); } } + + private void testVectorizedRowBatchSizeChangeListRead() throws Exception { + Configuration conf = new Configuration(); + conf.set(IOConstants.COLUMNS, "list_binary_field_for_repeat_test"); + conf.set(IOConstants.COLUMNS_TYPES, "array<string>"); + conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); + conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + VectorizedParquetRecordReader reader = createTestParquetReader( + "message hive_schema {repeated binary list_binary_field_for_repeat_test;}", conf); + VectorizedRowBatch previous = reader.createValue(); + try { + while (reader.next(NullWritable.get(), previous)) { + ListColumnVector vector = (ListColumnVector) previous.cols[0]; + // When dealing with big data, the VectorizedRowBatch will be used for the different file split + // to cache the data. Here is the situation: the first split only has 100 rows, + // and VectorizedRowBatch caches them; meanwhile, the size of VectorizedRowBatch will be + // updated to 100. The following code is to simulate the size change, but there will be no + // ArrayIndexOutOfBoundsException when processing the next split which has more than 100 rows. + vector.lengths = new long[100]; + vector.offsets = new long[100]; + } + } finally { + reader.close(); + } + } }