This is an automated email from the ASF dual-hosted git repository.

siddteotia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 01a099c  ARROW-2199: [JAVA] Control the memory allocated for inner 
vectors in containers. (#1646)
01a099c is described below

commit 01a099c81df7a54c5b22f944a77f7f20dbebc7ee
Author: Sidd <[email protected]>
AuthorDate: Mon Mar 5 15:01:18 2018 -0800

    ARROW-2199: [JAVA] Control the memory allocated for inner vectors in 
containers. (#1646)
    
    * ARROW-2199: [JAVA] Control the memory allocated for inner vector in LIST.
    
    Use density based setInitialCapacity and propagate density down the vector
    tree from complex vectors. Also ensure that density driven initial capacity
    is never less than 1.
    
    * comment cleanup
    
    * use Integer.MAX_VALUE instead of 2billion constant
    
    * review comments
    
    * review comments
    
    * address review comments for realloc and tests
---
 .../org/apache/arrow/memory/BaseAllocator.java     |  6 +++
 .../src/main/codegen/templates/UnionVector.java    |  1 +
 .../apache/arrow/vector/BaseFixedWidthVector.java  |  1 +
 .../arrow/vector/BaseVariableWidthVector.java      | 10 ++--
 .../apache/arrow/vector/DensityAwareVector.java    | 57 ++++++++++++++++++++++
 .../apache/arrow/vector/VariableWidthVector.java   |  2 +-
 .../vector/complex/AbstractContainerVector.java    |  3 +-
 .../vector/complex/BaseRepeatedValueVector.java    | 20 ++++++--
 .../arrow/vector/complex/FixedSizeListVector.java  |  1 +
 .../apache/arrow/vector/complex/ListVector.java    |  9 +++-
 .../vector/complex/NonNullableStructVector.java    | 11 +++++
 .../arrow/vector/complex/RepeatedValueVector.java  |  3 +-
 .../apache/arrow/vector/complex/StructVector.java  |  7 +++
 .../org/apache/arrow/vector/TestListVector.java    | 23 ++++++---
 .../org/apache/arrow/vector/TestValueVector.java   | 12 ++---
 15 files changed, 137 insertions(+), 29 deletions(-)

diff --git 
a/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java 
b/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java
index 5411baf..2f70f73 100644
--- a/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java
+++ b/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java
@@ -134,6 +134,9 @@ public abstract class BaseAllocator extends Accountant 
implements BufferAllocato
    * @return The closest power of two of that value.
    */
   public static int nextPowerOfTwo(int val) {
+    if (val == 0 || val == 1) {
+      return val + 1;
+    }
     int highestBit = Integer.highestOneBit(val);
     if (highestBit == val) {
       return val;
@@ -149,6 +152,9 @@ public abstract class BaseAllocator extends Accountant 
implements BufferAllocato
    * @return The closest power of two of that value.
    */
   public static long nextPowerOfTwo(long val) {
+    if (val == 0 || val == 1) {
+      return val + 1;
+    }
     long highestBit = Long.highestOneBit(val);
     if (highestBit == val) {
       return val;
diff --git a/java/vector/src/main/codegen/templates/UnionVector.java 
b/java/vector/src/main/codegen/templates/UnionVector.java
index 84450be..1cfa066 100644
--- a/java/vector/src/main/codegen/templates/UnionVector.java
+++ b/java/vector/src/main/codegen/templates/UnionVector.java
@@ -282,6 +282,7 @@ public class UnionVector implements FieldVector {
 
     long newAllocationSize = baseSize * 2L;
     newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize);
+    assert newAllocationSize >= 1;
 
     if (newAllocationSize > BaseValueVector.MAX_ALLOCATION_SIZE) {
       throw new OversizedAllocationException("Unable to expand the buffer");
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java 
b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java
index cbc56fe..4b47df8 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java
@@ -444,6 +444,7 @@ public abstract class BaseFixedWidthVector extends 
BaseValueVector
 
     long newAllocationSize = baseSize * 2L;
     newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize);
+    assert newAllocationSize >= 1;
 
     if (newAllocationSize > MAX_ALLOCATION_SIZE) {
       throw new OversizedAllocationException("Unable to expand the buffer");
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
 
b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
index c32d20f..ecb3c78 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
@@ -174,14 +174,14 @@ public abstract class BaseVariableWidthVector extends 
BaseValueVector
    * @param valueCount desired number of elements in the vector
    * @param density average number of bytes per variable width element
    */
+  @Override
   public void setInitialCapacity(int valueCount, double density) {
-    final long size = (long) (valueCount * density);
-    if (size < 1) {
-      throw new IllegalArgumentException("With the provided density and value 
count, potential capacity of the data buffer is 0");
-    }
+    long size = Math.max((long)(valueCount * density), 1L);
+
     if (size > MAX_ALLOCATION_SIZE) {
       throw new OversizedAllocationException("Requested amount of memory is 
more than max allowed");
     }
+
     valueAllocationSizeInBytes = (int) size;
     validityAllocationSizeInBytes = getValidityBufferSizeFromCount(valueCount);
     /* to track the end offset of last data element in vector, we need
@@ -489,6 +489,7 @@ public abstract class BaseVariableWidthVector extends 
BaseValueVector
 
     long newAllocationSize = baseSize * 2L;
     newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize);
+    assert newAllocationSize >= 1;
 
     if (newAllocationSize > MAX_ALLOCATION_SIZE) {
       throw new OversizedAllocationException("Unable to expand the buffer");
@@ -541,6 +542,7 @@ public abstract class BaseVariableWidthVector extends 
BaseValueVector
 
     long newAllocationSize = baseSize * 2L;
     newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize);
+    assert newAllocationSize >= 1;
 
     if (newAllocationSize > MAX_ALLOCATION_SIZE) {
       throw new OversizedAllocationException("Unable to expand the buffer");
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java 
b/java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java
new file mode 100644
index 0000000..2915661
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+/**
+ * Vector that support density aware initial capacity settings.
+ * We use this for ListVector and VarCharVector as of now to
+ * control the memory allocated.
+ *
+ * For ListVector, we have been using a multiplier of 5
+ * to compute the initial capacity of the inner data vector.
+ * For deeply nested lists and lists with lots of NULL values,
+ * this is over-allocation upfront. So density helps to be
+ * conservative when computing the value capacity of the
+ * inner vector.
+ *
+ * For example, a density value of 10 implies each position in the
+ * list vector has a list of 10 values. So we will provision
+ * an initial capacity of (valuecount * 10) for the inner vector.
+ * A density value of 0.1 implies out of 10 positions in the list vector,
+ * 1 position has a list of size 1 and remaining positions are
+ * null (no lists) or empty lists. This helps in tightly controlling
+ * the memory we provision for inner data vector.
+ *
+ * Similar analogy is applicable for VarCharVector where the capacity
+ * of the data buffer can be controlled using density multiplier
+ * instead of default multiplier of 8 (default size of average
+ * varchar length).
+ *
+ * Also from container vectors, we propagate the density down
+ * the the inner vectors so that they can use it appropriately.
+ */
+public interface DensityAwareVector {
+
+  /**
+   * Set value with density
+   * @param valueCount
+   * @param density
+   */
+  void setInitialCapacity(int valueCount, double density);
+}
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java 
b/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java
index 593d4dc..f91a5c8 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java
@@ -18,7 +18,7 @@
 
 package org.apache.arrow.vector;
 
-public interface VariableWidthVector extends ValueVector {
+public interface VariableWidthVector extends ValueVector, DensityAwareVector {
 
   /**
    * Allocate a new memory space for this vector.  Must be called prior to 
using the ValueVector.
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java
 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java
index a99f3c8..d2a3c4a 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java
@@ -20,6 +20,7 @@ package org.apache.arrow.vector.complex;
 
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.memory.OutOfMemoryException;
+import org.apache.arrow.vector.DensityAwareVector;
 import org.apache.arrow.vector.FieldVector;
 import org.apache.arrow.vector.ValueVector;
 import org.apache.arrow.vector.types.Types.MinorType;
@@ -33,7 +34,7 @@ import org.apache.arrow.vector.util.CallBack;
  *
  * This class implements common functionality of composite vectors.
  */
-public abstract class AbstractContainerVector implements ValueVector {
+public abstract class AbstractContainerVector implements ValueVector, 
DensityAwareVector {
   static final org.slf4j.Logger logger = 
org.slf4j.LoggerFactory.getLogger(AbstractContainerVector.class);
 
   protected final String name;
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
index 50ee3a7..2dd2894 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
@@ -25,6 +25,7 @@ import org.apache.arrow.memory.BaseAllocator;
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.vector.AddOrGetResult;
 import org.apache.arrow.vector.BaseValueVector;
+import org.apache.arrow.vector.DensityAwareVector;
 import org.apache.arrow.vector.FieldVector;
 import org.apache.arrow.vector.UInt4Vector;
 import org.apache.arrow.vector.ValueVector;
@@ -108,6 +109,7 @@ public abstract class BaseRepeatedValueVector extends 
BaseValueVector implements
 
     long newAllocationSize = baseSize * 2L;
     newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize);
+    assert newAllocationSize >= 1;
 
     if (newAllocationSize > MAX_ALLOCATION_SIZE) {
       throw new OversizedAllocationException("Unable to expand the buffer");
@@ -139,7 +141,7 @@ public abstract class BaseRepeatedValueVector extends 
BaseValueVector implements
     if (vector instanceof BaseFixedWidthVector || vector instanceof 
BaseVariableWidthVector) {
       vector.setInitialCapacity(numRecords * 
RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD);
     } else {
-     vector.setInitialCapacity(numRecords);
+      vector.setInitialCapacity(numRecords);
     }
   }
 
@@ -166,13 +168,21 @@ public abstract class BaseRepeatedValueVector extends 
BaseValueVector implements
    *                This helps in tightly controlling the memory we provision
    *                for inner data vector.
    */
+  @Override
   public void setInitialCapacity(int numRecords, double density) {
+    if ((numRecords * density) >= Integer.MAX_VALUE) {
+      throw new OversizedAllocationException("Requested amount of memory is 
more than max allowed");
+    }
+
     offsetAllocationSizeInBytes = (numRecords + 1) * OFFSET_WIDTH;
-    final int innerValueCapacity = (int)(numRecords * density);
-    if (innerValueCapacity < 1) {
-      throw new IllegalArgumentException("With the provided density and value 
count, potential value capacity for the data vector is 0");
+
+    int innerValueCapacity = Math.max((int)(numRecords * density), 1);
+
+    if (vector instanceof DensityAwareVector) {
+      ((DensityAwareVector)vector).setInitialCapacity(innerValueCapacity, 
density);
+    } else {
+      vector.setInitialCapacity(innerValueCapacity);
     }
-    vector.setInitialCapacity(innerValueCapacity);
   }
 
   @Override
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java
 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java
index 9314a25..eadbab4 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java
@@ -210,6 +210,7 @@ public class FixedSizeListVector extends BaseValueVector 
implements FieldVector,
 
     long newAllocationSize = baseSize * 2L;
     newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize);
+    assert newAllocationSize >= 1;
 
     if (newAllocationSize > MAX_ALLOCATION_SIZE) {
       throw new OversizedAllocationException("Unable to expand the buffer");
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
index b472dae..d3eeaf2 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
@@ -31,7 +31,12 @@ import io.netty.buffer.ArrowBuf;
 import org.apache.arrow.memory.BaseAllocator;
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.memory.OutOfMemoryException;
-import org.apache.arrow.vector.*;
+import org.apache.arrow.vector.AddOrGetResult;
+import org.apache.arrow.vector.BufferBacked;
+import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.ValueVector;
+import org.apache.arrow.vector.ZeroVector;
+import org.apache.arrow.vector.BitVectorHelper;
 import org.apache.arrow.vector.complex.impl.ComplexCopier;
 import org.apache.arrow.vector.complex.impl.UnionListReader;
 import org.apache.arrow.vector.complex.impl.UnionListWriter;
@@ -126,6 +131,7 @@ public class ListVector extends BaseRepeatedValueVector 
implements FieldVector,
    *                This helps in tightly controlling the memory we provision
    *                for inner data vector.
    */
+  @Override
   public void setInitialCapacity(int numRecords, double density) {
     validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords);
     super.setInitialCapacity(numRecords, density);
@@ -287,6 +293,7 @@ public class ListVector extends BaseRepeatedValueVector 
implements FieldVector,
 
     long newAllocationSize = baseSize * 2L;
     newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize);
+    assert newAllocationSize >= 1;
 
     if (newAllocationSize > MAX_ALLOCATION_SIZE) {
       throw new OversizedAllocationException("Unable to expand the buffer");
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java
 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java
index aec06b6..c41cbf2 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java
@@ -100,6 +100,17 @@ public class NonNullableStructVector extends 
AbstractStructVector {
   }
 
   @Override
+  public void setInitialCapacity(int valueCount, double density) {
+    for (final ValueVector vector : (Iterable<ValueVector>) this) {
+      if (vector instanceof DensityAwareVector) {
+        ((DensityAwareVector)vector).setInitialCapacity(valueCount, density);
+      } else {
+        vector.setInitialCapacity(valueCount);
+      }
+    }
+  }
+
+  @Override
   public int getBufferSize() {
     if (valueCount == 0 || size() == 0) {
       return 0;
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java
 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java
index 3640117..a7f6d43 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java
@@ -18,6 +18,7 @@
 
 package org.apache.arrow.vector.complex;
 
+import org.apache.arrow.vector.DensityAwareVector;
 import org.apache.arrow.vector.UInt4Vector;
 import org.apache.arrow.vector.ValueVector;
 
@@ -28,7 +29,7 @@ import org.apache.arrow.vector.ValueVector;
  * Current design maintains data and offsets vectors. Each cell is stored in 
the data vector. Repeated vector
  * uses the offset vector to determine the sequence of cells pertaining to an 
individual value.
  */
-public interface RepeatedValueVector extends ValueVector {
+public interface RepeatedValueVector extends ValueVector, DensityAwareVector {
 
   final static int DEFAULT_REPEAT_PER_RECORD = 5;
 
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java
index 26bb4bb..05571bb 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java
@@ -356,6 +356,12 @@ public class StructVector extends NonNullableStructVector 
implements FieldVector
   }
 
   @Override
+  public void setInitialCapacity(int numRecords, double density) {
+    validityAllocationSizeInBytes = 
BitVectorHelper.getValidityBufferSize(numRecords);
+    super.setInitialCapacity(numRecords, density);
+  }
+
+  @Override
   public boolean allocateNewSafe() {
     /* Boolean to keep track if all the memory allocations were successful
      * Used in the case of composite vectors when we need to allocate multiple
@@ -401,6 +407,7 @@ public class StructVector extends NonNullableStructVector 
implements FieldVector
 
     long newAllocationSize = baseSize * 2L;
     newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize);
+    assert newAllocationSize >= 1;
 
     if (newAllocationSize > BaseValueVector.MAX_ALLOCATION_SIZE) {
       throw new OversizedAllocationException("Unable to expand the buffer");
diff --git 
a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java 
b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
index d49a677..d12586e 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
@@ -811,14 +811,21 @@ public class TestListVector {
       assertEquals(512, vector.getValueCapacity());
       assertEquals(8, vector.getDataVector().getValueCapacity());
 
-      boolean error = false;
-      try {
-        vector.setInitialCapacity(5, 0.1);
-      } catch (IllegalArgumentException e) {
-        error = true;
-      } finally {
-        assertTrue(error);
-      }
+      /**
+       * inner value capacity we pass to data vector is 5 * 0.1 => 0
+       * which is then rounded off to 1. So we pass value count as 1
+       * to the inner int vector.
+       * the offset buffer of the list vector is allocated for 6 values
+       * which is 24 bytes and then rounded off to 32 bytes (8 values)
+       * the validity buffer of the list vector is allocated for 5
+       * values which is 1 byte. This is why value capacity of the list
+       * vector is 7 as we take the min of validity buffer value capacity
+       * and offset buffer value capacity.
+       */
+      vector.setInitialCapacity(5, 0.1);
+      vector.allocateNew();
+      assertEquals(7, vector.getValueCapacity());
+      assertEquals(1, vector.getDataVector().getValueCapacity());
     }
   }
 }
diff --git 
a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java 
b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
index 992bb62..5104962 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
@@ -1934,14 +1934,10 @@ public class TestValueVector {
       assertEquals(4096, vector.getValueCapacity());
       assertEquals(64, vector.getDataBuffer().capacity());
 
-      boolean error = false;
-      try {
-        vector.setInitialCapacity(5, 0.1);
-      } catch (IllegalArgumentException e) {
-        error = true;
-      } finally {
-        assertTrue(error);
-      }
+      vector.setInitialCapacity(5, 0.01);
+      vector.allocateNew();
+      assertEquals(7, vector.getValueCapacity());
+      assertEquals(2, vector.getDataBuffer().capacity());
     }
   }
 }

-- 
To stop receiving notification emails like this one, please contact
[email protected].

Reply via email to