vibhatha commented on code in PR #40340:
URL: https://github.com/apache/arrow/pull/40340#discussion_r1555405261


##########
java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java:
##########
@@ -0,0 +1,1464 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.arrow.memory.ArrowBuf;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.OutOfMemoryException;
+import org.apache.arrow.memory.util.ArrowBufPointer;
+import org.apache.arrow.memory.util.ByteFunctionHelpers;
+import org.apache.arrow.memory.util.CommonUtil;
+import org.apache.arrow.memory.util.hash.ArrowBufHasher;
+import org.apache.arrow.vector.compare.VectorVisitor;
+import org.apache.arrow.vector.ipc.message.ArrowFieldNode;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.util.CallBack;
+import org.apache.arrow.vector.util.OversizedAllocationException;
+import org.apache.arrow.vector.util.TransferPair;
+
+/**
+ * BaseVariableWidthViewVector is a base class providing functionality for
+ * strings/bytes types in view format.
+ *
+ */
+public abstract class BaseVariableWidthViewVector extends BaseValueVector 
implements AbstractVariableWidthVector {
+  // A single element of a view comprises 16 bytes
+  protected static final int VIEW_BUFFER_SIZE = 16;
+  public static final int INITIAL_VIEW_VALUE_ALLOCATION = 4096;
+  private static final int INITIAL_BYTE_COUNT = INITIAL_VIEW_VALUE_ALLOCATION 
* VIEW_BUFFER_SIZE;
+  private static final int MAX_BUFFER_SIZE = (int) 
Math.min(MAX_ALLOCATION_SIZE, Integer.MAX_VALUE);
+  private int lastValueCapacity;
+  private long lastValueAllocationSizeInBytes;
+
+  /*
+  * Variable Width View Vector comprises the following format
+  *
+  * Short strings, length <= 12
+  * | Bytes 0-3  | Bytes 4-15                            |
+  * |------------|---------------------------------------|
+  * | length     | data (padded with 0)                  |
+  * |------------|---------------------------------------|
+  *
+  * Long strings, length > 12
+  * | Bytes 0-3  | Bytes 4-7  | Bytes 8-11 | Bytes 12-15 |
+  * |------------|------------|------------|-------------|
+  * | length     | prefix     | buf.index  | offset      |
+  * |------------|------------|------------|-------------|
+  *
+  * */
+  // Maximum length in bytes (12) of a value that is stored inline in the view
+  protected static final int INLINE_SIZE = 12;
+  // The first 4 bytes of view are allocated for length
+  protected static final int LENGTH_WIDTH = 4;
+  // The second 4 bytes of view are allocated for the prefix (long strings only)
+  protected static final int PREFIX_WIDTH = 4;
+  // The third 4 bytes of view are allocated for buffer index
+  protected static final int BUF_INDEX_WIDTH = 4;
+  protected static final byte[] emptyByteArray = new byte[]{};
+  protected ArrowBuf validityBuffer;
+  // The view buffer is used to store the variable width view elements
+  protected ArrowBuf viewBuffer;
+  // The external buffer which stores the long strings
+  protected List<ArrowBuf> dataBuffers;
+  protected int initialDataBufferSize;
+  protected int valueCount;
+  protected int lastSet;
+  protected final Field field;
+
+
+  /**
+   * Constructs a new instance.
+   *
+   * @param field The field materialized by this vector
+   * @param allocator The allocator to use for creating/resizing buffers
+   */
+  public BaseVariableWidthViewVector(Field field, final BufferAllocator 
allocator) {
+    super(allocator);
+    this.field = field;
+    lastValueAllocationSizeInBytes = INITIAL_BYTE_COUNT;
+    lastValueCapacity = INITIAL_VIEW_VALUE_ALLOCATION;
+    valueCount = 0;
+    lastSet = -1;
+    validityBuffer = allocator.getEmpty();
+    viewBuffer = allocator.getEmpty();
+    dataBuffers = new ArrayList<>();
+  }
+
+  @Override
+  public String getName() {
+    return field.getName();
+  }
+
+  /* TODO:
+   * see if getNullCount() can be made faster -- O(1)
+   */
+
+  /* TODO:
+   * Once the entire hierarchy has been refactored, move common functions
+   * like getNullCount(), splitAndTransferValidityBuffer to top level
+   * base class BaseValueVector.
+   *
+   * Along with this, some class members (validityBuffer) can also be
+   * abstracted out to top level base class.
+   *
+   * Right now BaseValueVector is the top level base class for other
+   * vector types in ValueVector hierarchy (non-nullable) and those
+   * vectors have not yet been refactored/removed so moving things to
+   * the top class as of now is not a good idea.
+   */
+
+  /* TODO:
+   * Implement TransferPair functionality
+   * https://github.com/apache/arrow/issues/40932
+   *
+   */
+
+  /**
+   * Get buffer that manages the validity (NULL or NON-NULL nature) of
+   * elements in the vector. Consider it as a buffer for internal bit vector
+   * data structure.
+   *
+   * @return buffer
+   */
+  @Override
+  public ArrowBuf getValidityBuffer() {
+    return validityBuffer;
+  }
+
+  /**
+   * Get the view buffer, which stores the 16-byte view element of each value
+   * in the vector.
+   *
+   * @return buffer
+   */
+  @Override
+  public ArrowBuf getDataBuffer() {
+    return viewBuffer;
+  }
+
+  /**
+   * BaseVariableWidthViewVector doesn't support offset buffer.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public ArrowBuf getOffsetBuffer() {
+    throw new UnsupportedOperationException("Offset buffer is not supported in 
BaseVariableWidthViewVector");
+  }
+
+  /**
+   * BaseVariableWidthViewVector doesn't support offset buffer.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public long getOffsetBufferAddress() {
+    throw new UnsupportedOperationException("Offset buffer is not supported in 
BaseVariableWidthViewVector");
+  }
+
+  /**
+   * Get the memory address of buffer that manages the validity
+   * (NULL or NON-NULL nature) of elements in the vector.
+   *
+   * @return starting address of the buffer
+   */
+  @Override
+  public long getValidityBufferAddress() {
+    return validityBuffer.memoryAddress();
+  }
+
+  /**
+   * Get the memory address of buffer that stores the data for elements
+   * in the vector.
+   *
+   * @return starting address of the buffer
+   */
+  @Override
+  public long getDataBufferAddress() {
+    return viewBuffer.memoryAddress();
+  }
+
+  /**
+   * Sets the desired value capacity for the vector. This function doesn't
+   * allocate any memory for the vector.
+   *
+   * @param valueCount desired number of elements in the vector
+   */
+  @Override
+  public void setInitialCapacity(int valueCount) {
+    final long size = (long) valueCount * VIEW_BUFFER_SIZE;
+    checkDataBufferSize(size);
+    lastValueAllocationSizeInBytes = (int) size;
+    lastValueCapacity = valueCount;
+  }
+
+  /**
+   * Sets the desired value capacity for the vector. This function doesn't
+   * allocate any memory for the vector.
+   *
+   * @param valueCount desired number of elements in the vector
+   * @param density average number of bytes per variable width view element
+   */
+  @Override
+  public void setInitialCapacity(int valueCount, double density) {
+    final long size = (long) valueCount * VIEW_BUFFER_SIZE;
+    initialDataBufferSize = (int) (valueCount * density);
+    checkDataBufferSize(size);
+    lastValueAllocationSizeInBytes = (int) size;
+    lastValueCapacity = valueCount;
+  }
+
+  /**
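+   * Get the density of this vector: the value of getTotalLengthUptoIndex(valueCount)
+   * divided by valueCount, or 0 if the vector is empty.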
+   * @return density
+   */
+  public double getDensity() {
+    if (valueCount == 0) {
+      return 0.0D;
+    }
+    final double totalListSize = getTotalLengthUptoIndex(valueCount);
+    return totalListSize / valueCount;
+  }
+
+  /**
+   * Get the current value capacity, which is the smaller of the number of elements the
+   * validity buffer can track and the number of views the view buffer can hold.
+   * Note: the data buffers are not taken into account by `getValueCapacity`.
+   *
+   * @return number of elements that vector can hold.
+   */
+  @Override
+  public int getValueCapacity() {
+    final int validityCapacity = getValidityBufferValueCapacity();
+    final int valueBufferCapacity = Math.max(capAtMaxInt(viewBuffer.capacity() 
/ VIEW_BUFFER_SIZE), 0);
+    return Math.min(valueBufferCapacity, validityCapacity);
+  }
+
+  private int getValidityBufferValueCapacity() {
+    return capAtMaxInt(validityBuffer.capacity() * 8);
+  }
+
+  /**
+   * Zero out the validity buffer and the view buffer. The data buffers are left untouched.
+   */
+  public void zeroVector() {
+    initValidityBuffer();
+    viewBuffer.setZero(0, viewBuffer.capacity());
+  }
+
+  /* zero out the validity buffer */
+  private void initValidityBuffer() {
+    validityBuffer.setZero(0, validityBuffer.capacity());
+  }
+
+  /**
+   * Reset the vector to initial state: zero out the buffers as in {@link #zeroVector()}
+   * and reset lastSet and valueCount.
+   * Note that this method doesn't release any memory.
+   */
+  @Override
+  public void reset() {
+    zeroVector();
+    lastSet = -1;
+    valueCount = 0;
+  }
+
+  /**
+   * Close the vector and release the associated buffers.
+   */
+  @Override
+  public void close() {
+    clear();
+  }
+
+  /**
+   * Same as {@link #close()}.
+   */
+  @Override
+  public void clear() {
+    validityBuffer = releaseBuffer(validityBuffer);
+    viewBuffer = releaseBuffer(viewBuffer);
+    clearDataBuffers();
+    lastSet = -1;
+    valueCount = 0;
+  }
+
+  /**
+  * Release the data buffers and clear the list.
+  */
+  public void clearDataBuffers() {
+    for (ArrowBuf buffer : dataBuffers) {
+      buffer.getReferenceManager().release();
+    }
+    dataBuffers.clear();
+  }
+
+  /**
+   * Get the inner vectors.
+   *
+   * @deprecated This API will be removed as the current implementations no 
longer support inner vectors.

Review Comment:
   Sure, I can do that.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to