This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 61b2926  ARROW-4532: [Java] fix bug causing very large varchar value 
buffers
61b2926 is described below

commit 61b2926fdbd07dae3bfefae6aabdb2dc4a18f48c
Author: Pindikura Ravindra <[email protected]>
AuthorDate: Tue Feb 12 15:35:56 2019 +0100

    ARROW-4532: [Java] fix bug causing very large varchar value buffers
    
    The varchar/varbinary vectors have a setSafe variant that takes an arrow 
buf and a start/end offsets. The method needs to expand the buffer to make 
space for 'end - start' bytes. but, due to a bug, it was expanding it to 
accommodate 'end' bytes, thereby making the value buffer much larger than 
required.
    
    Author: Pindikura Ravindra <[email protected]>
    
    Closes #3613 from pravindra/varchar and squashes the following commits:
    
    1b4f2243 <Pindikura Ravindra> ARROW-4532: fix review comments
    8f88e3aa <Pindikura Ravindra> ARROW-4532:  fix a size compute bug
---
 .../arrow/vector/BaseVariableWidthVector.java      |  2 +-
 .../org/apache/arrow/vector/TestValueVector.java   | 45 ++++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
 
b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
index ac148a2..a5b550a 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
@@ -1100,7 +1100,7 @@ public abstract class BaseVariableWidthVector extends 
BaseValueVector
     assert index >= 0;
     final int dataLength = end - start;
     fillEmpties(index);
-    handleSafe(index, end);
+    handleSafe(index, dataLength);
     BitVectorHelper.setValidityBit(validityBuffer, index, isSet);
     final int startOffset = offsetBuffer.getInt(index * OFFSET_WIDTH);
     offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + dataLength);
diff --git 
a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java 
b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
index 30fe23c..f0cc4c4 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
@@ -1395,6 +1395,51 @@ public class TestValueVector {
   }
 
   @Test
+  public void testSetSafeWithArrowBufNoExcessAllocs() {
+    final int numValues = BaseFixedWidthVector.INITIAL_VALUE_ALLOCATION * 2;
+    final byte[] valueBytes = "hello world".getBytes();
+    final int valueBytesLength = valueBytes.length;
+    final int isSet = 1;
+
+    try (
+        final VarCharVector fromVector = newVector(VarCharVector.class, 
EMPTY_SCHEMA_PATH,
+            MinorType.VARCHAR, allocator);
+        final VarCharVector toVector = newVector(VarCharVector.class, 
EMPTY_SCHEMA_PATH,
+            MinorType.VARCHAR, allocator)) {
+      /*
+       * Populate the from vector with 'numValues' with byte-arrays, each of 
size 'valueBytesLength'.
+       */
+      fromVector.setInitialCapacity(numValues);
+      fromVector.allocateNew();
+      for (int i = 0; i < numValues; ++i) {
+        fromVector.setSafe(i, valueBytes, 0 /*start*/, valueBytesLength);
+      }
+      fromVector.setValueCount(numValues);
+      ArrowBuf fromDataBuffer = fromVector.getDataBuffer();
+      assertTrue(numValues * valueBytesLength <= fromDataBuffer.capacity());
+
+      /*
+       * Copy the entries one-by-one from 'fromVector' to 'toVector', but use 
the setSafe with
+       * ArrowBuf API (instead of setSafe with byte-array).
+       */
+      toVector.setInitialCapacity(numValues);
+      toVector.allocateNew();
+      for (int i = 0; i < numValues; i++) {
+        int start = fromVector.getstartOffset(i);
+        int end = fromVector.getstartOffset(i + 1);
+        toVector.setSafe(i, isSet, start, end, fromDataBuffer);
+      }
+
+      /*
+       * Since the 'fromVector' and 'toVector' have the same initial capacity, 
and were populated
+       * with the same varchar elements, the allocations and hence, the final 
capacity should be
+       * the same.
+       */
+      assertEquals(fromDataBuffer.capacity(), 
toVector.getDataBuffer().capacity());
+    }
+  }
+
+  @Test
   public void testCopyFromWithNulls() {
     try (final VarCharVector vector = newVector(VarCharVector.class, 
EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator);
          final VarCharVector vector2 =

Reply via email to