Repository: hive Updated Branches: refs/heads/master 7942bc6c9 -> 17abdb211
HIVE-18411: Fix ArrayIndexOutOfBoundsException for VectorizedListColumnReader (Colin Ma, reviewed by Ferdinand Xu) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/17abdb21 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/17abdb21 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/17abdb21 Branch: refs/heads/master Commit: 17abdb211c1b2b749fc7d8265d31e6c5987cea4b Parents: 7942bc6 Author: Ferdinand Xu <cheng.a...@intel.com> Authored: Wed Jan 17 15:39:54 2018 +0800 Committer: Ferdinand Xu <cheng.a...@intel.com> Committed: Wed Jan 17 15:39:54 2018 +0800 ---------------------------------------------------------------------- .../vector/VectorizedListColumnReader.java | 5 +++ .../parquet/TestVectorizedListColumnReader.java | 33 ++++++++++++++++++++ 2 files changed, 38 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/17abdb21/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java index 12af77c..04fa129 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java @@ -19,6 +19,7 @@ import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import 
org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; @@ -52,6 +53,10 @@ public class VectorizedListColumnReader extends BaseVectorizedColumnReader { @Override public void readBatch(int total, ColumnVector column, TypeInfo columnType) throws IOException { ListColumnVector lcv = (ListColumnVector) column; + // before readBatch, initialize the size of offsets & lengths as the default value, + // the actual size will be assigned in setChildrenInfo() after reading completes. + lcv.offsets = new long[VectorizedRowBatch.DEFAULT_SIZE]; + lcv.lengths = new long[VectorizedRowBatch.DEFAULT_SIZE]; // Because the length of ListColumnVector.child can't be known now, // the valueList will save all data for ListColumnVector temporary. List<Object> valueList = new ArrayList<>(); http://git-wip-us.apache.org/repos/asf/hive/blob/17abdb21/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java index 8ea5d25..d241fc8 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java @@ -166,6 +166,14 @@ public class TestVectorizedListColumnReader extends VectorizedColumnReaderTestBa removeFile(); } + @Test + public void testVectorizedRowBatchSizeChange() throws Exception { + removeFile(); + writeListData(initWriterFromFile(), false, 1200); + testVectorizedRowBatchSizeChangeListRead(); + removeFile(); + } + private void testListReadAllType(boolean isDictionaryEncoding, int elementNum) throws Exception { testListRead(isDictionaryEncoding, "int", elementNum); testListRead(isDictionaryEncoding, "long", elementNum); @@ -337,4 
+345,29 @@ public class TestVectorizedListColumnReader extends VectorizedColumnReaderTestBa reader.close(); } } + + private void testVectorizedRowBatchSizeChangeListRead() throws Exception { + Configuration conf = new Configuration(); + conf.set(IOConstants.COLUMNS, "list_binary_field_for_repeat_test"); + conf.set(IOConstants.COLUMNS_TYPES, "array<string>"); + conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); + conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); + VectorizedParquetRecordReader reader = createTestParquetReader( + "message hive_schema {repeated binary list_binary_field_for_repeat_test;}", conf); + VectorizedRowBatch previous = reader.createValue(); + try { + while (reader.next(NullWritable.get(), previous)) { + ListColumnVector vector = (ListColumnVector) previous.cols[0]; + // When dealing with big data, the VectorizedRowBatch will be used for the different file split + // to cache the data. Here is the situation: the first split only has 100 rows, + // and VectorizedRowBatch caches them; meanwhile, the size of VectorizedRowBatch will be + // updated to 100. The following code is to simulate the size change, but there will be no + // ArrayIndexOutOfBoundsException when processing the next split which has more than 100 rows. + vector.lengths = new long[100]; + vector.offsets = new long[100]; + } + } finally { + reader.close(); + } + } }