[
https://issues.apache.org/jira/browse/ARROW-1943?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16301848#comment-16301848
]
ASF GitHub Bot commented on ARROW-1943:
---------------------------------------
siddharthteotia closed pull request #1439: ARROW-1943: [JAVA] handle
setInitialCapacity for deeply nested lists
URL: https://github.com/apache/arrow/pull/1439
This is a PR merged from a forked repository.
Because GitHub hides the original diff of a foreign pull request (one made
from a fork) once it has been merged, the diff is reproduced below for the
sake of provenance:
diff --git
a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
index a9221f2f6..9a23fd8c3 100644
---
a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
+++
b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
@@ -29,6 +29,8 @@
import org.apache.arrow.vector.UInt4Vector;
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.ZeroVector;
+import org.apache.arrow.vector.BaseVariableWidthVector;
+import org.apache.arrow.vector.BaseFixedWidthVector;
import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeID;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.util.CallBack;
@@ -134,7 +136,11 @@ public FieldVector getDataVector() {
@Override
public void setInitialCapacity(int numRecords) {
offsetAllocationSizeInBytes = (numRecords + 1) * OFFSET_WIDTH;
- vector.setInitialCapacity(numRecords *
RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD);
+ if (vector instanceof BaseFixedWidthVector || vector instanceof
BaseVariableWidthVector) {
+ vector.setInitialCapacity(numRecords *
RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD);
+ } else {
+ vector.setInitialCapacity(numRecords);
+ }
}
@Override
diff --git
a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
index 1acce7e0b..e2023f446 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
@@ -559,6 +559,134 @@ public void testNestedListVector() throws Exception {
}
}
+ @Test
+ public void testNestedListVector1() throws Exception {
+ try (ListVector listVector = ListVector.empty("sourceVector", allocator)) {
+
+ MinorType listType = MinorType.LIST;
+ MinorType scalarType = MinorType.BIGINT;
+
+ listVector.addOrGetVector(FieldType.nullable(listType.getType()));
+
+ ListVector innerList1 = (ListVector)listVector.getDataVector();
+ innerList1.addOrGetVector(FieldType.nullable(listType.getType()));
+
+ ListVector innerList2 = (ListVector)innerList1.getDataVector();
+ innerList2.addOrGetVector(FieldType.nullable(listType.getType()));
+
+ ListVector innerList3 = (ListVector)innerList2.getDataVector();
+ innerList3.addOrGetVector(FieldType.nullable(listType.getType()));
+
+ ListVector innerList4 = (ListVector)innerList3.getDataVector();
+ innerList4.addOrGetVector(FieldType.nullable(listType.getType()));
+
+ ListVector innerList5 = (ListVector)innerList4.getDataVector();
+ innerList5.addOrGetVector(FieldType.nullable(listType.getType()));
+
+ ListVector innerList6 = (ListVector)innerList5.getDataVector();
+ innerList6.addOrGetVector(FieldType.nullable(scalarType.getType()));
+
+ listVector.setInitialCapacity(128);
+ }
+ }
+
+ @Test
+ public void testNestedListVector2() throws Exception {
+ try (ListVector listVector = ListVector.empty("sourceVector", allocator)) {
+ listVector.setInitialCapacity(1);
+ UnionListWriter listWriter = listVector.getWriter();
+ /* allocate memory */
+ listWriter.allocate();
+
+ /* write one or more inner lists at index 0 */
+ listWriter.setPosition(0);
+ listWriter.startList();
+
+ listWriter.list().startList();
+ listWriter.list().bigInt().writeBigInt(50);
+ listWriter.list().bigInt().writeBigInt(100);
+ listWriter.list().bigInt().writeBigInt(200);
+ listWriter.list().endList();
+
+ listWriter.list().startList();
+ listWriter.list().bigInt().writeBigInt(75);
+ listWriter.list().bigInt().writeBigInt(125);
+ listWriter.list().endList();
+
+ listWriter.endList();
+
+ /* write one or more inner lists at index 1 */
+ listWriter.setPosition(1);
+ listWriter.startList();
+
+ listWriter.list().startList();
+ listWriter.list().bigInt().writeBigInt(15);
+ listWriter.list().bigInt().writeBigInt(20);
+ listWriter.list().endList();
+
+ listWriter.list().startList();
+ listWriter.list().bigInt().writeBigInt(25);
+ listWriter.list().bigInt().writeBigInt(30);
+ listWriter.list().bigInt().writeBigInt(35);
+ listWriter.list().endList();
+
+ listWriter.endList();
+
+ assertEquals(2, listVector.getLastSet());
+
+ listVector.setValueCount(2);
+
+ assertEquals(2, listVector.getValueCount());
+
+ /* get listVector value at index 0 -- the value itself is a listvector */
+ Object result = listVector.getObject(0);
+ ArrayList<ArrayList<Long>> resultSet = (ArrayList<ArrayList<Long>>)
result;
+ ArrayList<Long> list;
+
+ assertEquals(2, resultSet.size()); /* 2 inner lists at
index 0 */
+ assertEquals(3, resultSet.get(0).size()); /* size of first inner
list */
+ assertEquals(2, resultSet.get(1).size()); /* size of second inner
list */
+
+ list = resultSet.get(0);
+ assertEquals(new Long(50), list.get(0));
+ assertEquals(new Long(100), list.get(1));
+ assertEquals(new Long(200), list.get(2));
+
+ list = resultSet.get(1);
+ assertEquals(new Long(75), list.get(0));
+ assertEquals(new Long(125), list.get(1));
+
+ /* get listVector value at index 1 -- the value itself is a listvector
*/
+ result = listVector.getObject(1);
+ resultSet = (ArrayList<ArrayList<Long>>) result;
+
+ assertEquals(2, resultSet.size()); /* 2 inner lists at
index 1 */
+ assertEquals(2, resultSet.get(0).size()); /* size of first inner
list */
+ assertEquals(3, resultSet.get(1).size()); /* size of second inner
list */
+
+ list = resultSet.get(0);
+ assertEquals(new Long(15), list.get(0));
+ assertEquals(new Long(20), list.get(1));
+
+ list = resultSet.get(1);
+ assertEquals(new Long(25), list.get(0));
+ assertEquals(new Long(30), list.get(1));
+ assertEquals(new Long(35), list.get(2));
+
+ /* check underlying bitVector */
+ assertFalse(listVector.isNull(0));
+ assertFalse(listVector.isNull(1));
+
+ /* check underlying offsets */
+ final ArrowBuf offsetBuffer = listVector.getOffsetBuffer();
+
+ /* listVector has 2 lists at index 0 and 2 lists at index 1 */
+ assertEquals(0, offsetBuffer.getInt(0 * ListVector.OFFSET_WIDTH));
+ assertEquals(2, offsetBuffer.getInt(1 * ListVector.OFFSET_WIDTH));
+ assertEquals(4, offsetBuffer.getInt(2 * ListVector.OFFSET_WIDTH));
+ }
+ }
+
@Test
public void testGetBufferAddress() throws Exception {
try (ListVector listVector = ListVector.empty("vector", allocator)) {
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> Handle setInitialCapacity() for deeply nested lists of lists
> ------------------------------------------------------------
>
> Key: ARROW-1943
> URL: https://issues.apache.org/jira/browse/ARROW-1943
> Project: Apache Arrow
> Issue Type: Bug
> Reporter: Siddharth Teotia
> Assignee: Siddharth Teotia
> Labels: pull-request-available
> Fix For: 0.9.0
>
>
> The current implementation of setInitialCapacity() multiplies the capacity by
> a factor of 5 at every level of list nesting.
> So if the schema is LIST (LIST (LIST (LIST (LIST (LIST (LIST (BIGINT)))))))
> and we start with an initial capacity of 128, we end up throwing
> OversizedAllocationException from the BigIntVector, because the capacity was
> multiplied by 5 at every level and, by the time we reached the inner scalar
> vector that actually stores the data, we were well over the maximum size
> limit per vector (1MB).
> We saw this problem in Dremio when we failed to read deeply nested JSON data.
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)