This is an automated email from the ASF dual-hosted git repository.

lidavidm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 38922eded5 GH-37703: [Java] Method for setting exact number of records 
in ListVector (#37838)
38922eded5 is described below

commit 38922eded5797afca8ade33145bf59140ada1663
Author: James Duong <[email protected]>
AuthorDate: Tue Sep 26 05:44:53 2023 -0700

    GH-37703: [Java] Method for setting exact number of records in ListVector 
(#37838)
    
    ### Rationale for this change
    There is currently a setInitialCapacity() function that can be used to set 
a number of records and density factor when setting the capacity on a 
ListVector. A developer may want to specify the exact total number of records 
instead and can use the new methods introduced here.
    
    ### What changes are included in this PR?
    
    Add setInitialTotalCapacity() to BaseRepeatedValueVector, ListVector, 
DensityAwareVector, and LargeListVector to specify the exact total number of 
records in the backing vector.
    
    This is an alternative to using the density argument in 
setInitialCapacity() that allows the caller to precisely specify the capacity.
    
    ### Are these changes tested?
    Yes.
    
    ### Are there any user-facing changes?
    No.
    
    * Closes: #37703
    
    Authored-by: James Duong <[email protected]>
    Signed-off-by: David Li <[email protected]>
---
 .../vector/complex/BaseRepeatedValueVector.java    | 21 +++++++++++++++++++++
 .../arrow/vector/complex/LargeListVector.java      | 21 +++++++++++++++++++++
 .../apache/arrow/vector/complex/ListVector.java    | 22 ++++++++++++++++++++++
 .../apache/arrow/vector/TestLargeListVector.java   | 20 ++++++++++++++++++++
 .../org/apache/arrow/vector/TestListVector.java    | 20 ++++++++++++++++++++
 5 files changed, 104 insertions(+)

diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
index 62d4a1299d..95deceb4e7 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
@@ -205,6 +205,27 @@ public abstract class BaseRepeatedValueVector extends 
BaseValueVector implements
     }
   }
 
+  /**
+   * Specialized version of setInitialTotalCapacity() for ListVector. This is
+   * used by some callers when they want to explicitly control and be
+   * conservative about memory allocated for inner data vector. This is
+   * very useful when we are working with memory constraints for a query
+   * and have a fixed amount of memory reserved for the record batch. In
+   * such cases, we are likely to face OOM or related problems when
+   * we reserve memory for a record batch with value count x and
+   * do setInitialCapacity(x) such that each vector allocates only
+   * what is necessary and not the default amount but the multiplier
+   * forces the memory requirement to go beyond what was needed.
+   *
+   * @param numRecords value count
+   * @param totalNumberOfElements the total number of elements to allow
+   *                              for in this vector across all records.
+   */
+  public void setInitialTotalCapacity(int numRecords, int 
totalNumberOfElements) {
+    offsetAllocationSizeInBytes = (numRecords + 1) * OFFSET_WIDTH;
+    vector.setInitialCapacity(totalNumberOfElements);
+  }
+
   @Override
   public int getValueCapacity() {
     final int offsetValueCapacity = Math.max(getOffsetBufferValueCapacity() - 
1, 0);
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java
 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java
index 6ef5f994fc..acb058cda3 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java
@@ -196,6 +196,27 @@ public class LargeListVector extends BaseValueVector 
implements RepeatedValueVec
     }
   }
 
+  /**
+   * Specialized version of setInitialTotalCapacity() for LargeListVector. This is
+   * used by some callers when they want to explicitly control and be
+   * conservative about memory allocated for inner data vector. This is
+   * very useful when we are working with memory constraints for a query
+   * and have a fixed amount of memory reserved for the record batch. In
+   * such cases, we are likely to face OOM or related problems when
+   * we reserve memory for a record batch with value count x and
+   * do setInitialCapacity(x) such that each vector allocates only
+   * what is necessary and not the default amount but the multiplier
+   * forces the memory requirement to go beyond what was needed.
+   *
+   * @param numRecords value count
+   * @param totalNumberOfElements the total number of elements to allow
+   *                              for in this vector across all records.
+   */
+  public void setInitialTotalCapacity(int numRecords, int 
totalNumberOfElements) {
+    offsetAllocationSizeInBytes = (numRecords + 1) * OFFSET_WIDTH;
+    vector.setInitialCapacity(totalNumberOfElements);
+  }
+
   /**
    * Get the density of this ListVector.
    * @return density
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
index 52e5307e13..0d6ff11f8c 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
@@ -148,6 +148,28 @@ public class ListVector extends BaseRepeatedValueVector 
implements PromotableVec
     super.setInitialCapacity(numRecords, density);
   }
 
+  /**
+   * Specialized version of setInitialTotalCapacity() for ListVector. This is
+   * used by some callers when they want to explicitly control and be
+   * conservative about memory allocated for inner data vector. This is
+   * very useful when we are working with memory constraints for a query
+   * and have a fixed amount of memory reserved for the record batch. In
+   * such cases, we are likely to face OOM or related problems when
+   * we reserve memory for a record batch with value count x and
+   * do setInitialCapacity(x) such that each vector allocates only
+   * what is necessary and not the default amount but the multiplier
+   * forces the memory requirement to go beyond what was needed.
+   *
+   * @param numRecords value count
+   * @param totalNumberOfElements the total number of elements to allow
+   *                              for in this vector across all records.
+   */
+  @Override
+  public void setInitialTotalCapacity(int numRecords, int 
totalNumberOfElements) {
+    validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords);
+    super.setInitialTotalCapacity(numRecords, totalNumberOfElements);
+  }
+
   /**
    * Get the density of this ListVector.
    * @return density
diff --git 
a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java 
b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java
index c1d60da4d5..adf86183c0 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java
@@ -972,6 +972,26 @@ public class TestLargeListVector {
     }
   }
 
+  @Test
+  public void testTotalCapacity() {
+    final FieldType type = FieldType.nullable(MinorType.INT.getType());
+    try (final LargeListVector vector = new LargeListVector("list", allocator, 
type, null)) {
+      // Force the child vector to be allocated based on the type
+      // (this is a bad API: we have to track and repeat the type twice)
+      vector.addOrGetVector(type);
+
+      // Specify the allocation size but do not actually allocate
+      vector.setInitialTotalCapacity(10, 100);
+
+      // Finally actually do the allocation
+      vector.allocateNewSafe();
+
+      // Note: allocator rounds up and can be greater than the requested 
allocation.
+      assertTrue(vector.getValueCapacity() >= 10);
+      assertTrue(vector.getDataVector().getValueCapacity() >= 100);
+    }
+  }
+
   private void writeIntValues(UnionLargeListWriter writer, int[] values) {
     writer.startList();
     for (int v: values) {
diff --git 
a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java 
b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
index f0f19058ee..2a1228c2a3 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
@@ -1115,6 +1115,26 @@ public class TestListVector {
     }
   }
 
+  @Test
+  public void testTotalCapacity() {
+    final FieldType type = FieldType.nullable(MinorType.INT.getType());
+    try (final ListVector vector = new ListVector("list", allocator, type, 
null)) {
+      // Force the child vector to be allocated based on the type
+      // (this is a bad API: we have to track and repeat the type twice)
+      vector.addOrGetVector(type);
+
+      // Specify the allocation size but do not actually allocate
+      vector.setInitialTotalCapacity(10, 100);
+
+      // Finally actually do the allocation
+      vector.allocateNewSafe();
+
+      // Note: allocator rounds up and can be greater than the requested 
allocation.
+      assertTrue(vector.getValueCapacity() >= 10);
+      assertTrue(vector.getDataVector().getValueCapacity() >= 100);
+    }
+  }
+
   private void writeIntValues(UnionListWriter writer, int[] values) {
     writer.startList();
     for (int v: values) {

Reply via email to