vibhatha commented on code in PR #40340: URL: https://github.com/apache/arrow/pull/40340#discussion_r1551960595
########## java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java: ########## @@ -0,0 +1,1570 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.ByteFunctionHelpers; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.TransferPair; + +/** + * BaseVariableWidthViewVector is a base class providing functionality for strings/bytes types in view format. + * + */ +public abstract class BaseVariableWidthViewVector extends AbstractVariableWidthVector { + private static final int DEFAULT_RECORD_BYTE_COUNT = 16; + private static final int INITIAL_BYTE_COUNT = INITIAL_VALUE_ALLOCATION * DEFAULT_RECORD_BYTE_COUNT; + private static final int MAX_BUFFER_SIZE = (int) Math.min(MAX_ALLOCATION_SIZE, Integer.MAX_VALUE); + private int lastValueCapacity; + private long lastValueAllocationSizeInBytes; + + public static final int OFFSET_WIDTH = 4; /* 4 byte unsigned int to track offsets */ + /* protected members */ + protected static final int INLINE_SIZE = 12; /* 12 byte unsigned int to track inline views*/ + protected static final int VIEW_BUFFER_SIZE = 16; /* 16 byte default size for each view*/ + protected static final int LENGTH_WIDTH = 4; /* the first 4 bytes of view are allocated for length*/ + protected static final int PREFIX_WIDTH = 4; /* the second 4 bytes of view are allocated for prefix width*/ + protected static final int BUF_INDEX_WIDTH = 4; /* third 4 bytes of view are allocated for buffer index*/ + protected static final byte[] emptyByteArray = new byte[]{}; + protected ArrowBuf validityBuffer; + protected ArrowBuf valueBuffer; + protected ArrowBuf offsetBuffer; + protected int valueCount; + protected int lastSet; + protected final Field field; + protected List<ArrowBuf> dataBuffers; + + /** + * Constructs a new instance. + * + * @param field The field materialized by this vector + * @param allocator The allocator to use for creating/resizing buffers + */ + public BaseVariableWidthViewVector(Field field, final BufferAllocator allocator) { + super(allocator); + this.field = field; + lastValueAllocationSizeInBytes = INITIAL_BYTE_COUNT; + // -1 because we require one extra slot for the offset array. + lastValueCapacity = INITIAL_VALUE_ALLOCATION - 1; + valueCount = 0; + lastSet = -1; + offsetBuffer = allocator.getEmpty(); + validityBuffer = allocator.getEmpty(); + valueBuffer = allocator.getEmpty(); + dataBuffers = new ArrayList<>(); + } + + @Override + public String getName() { + return field.getName(); + } + + /* TODO: + * see if getNullCount() can be made faster -- O(1) + */ + + /* TODO: + * Once the entire hierarchy has been refactored, move common functions + * like getNullCount(), splitAndTransferValidityBuffer to top level + * base class BaseValueVector. + * + * Along with this, some class members (validityBuffer) can also be + * abstracted out to top level base class. + * + * Right now BaseValueVector is the top level base class for other + * vector types in ValueVector hierarchy (non-nullable) and those + * vectors have not yet been refactored/removed so moving things to + * the top class as of now is not a good idea. + */ + + /* TODO: + * Implement TransferPair functionality + * https://github.com/apache/arrow/issues/40932 + * + */ + + /** + * Get buffer that manages the validity (NULL or NON-NULL nature) of + * elements in the vector. Consider it as a buffer for internal bit vector + * data structure. + * + * @return buffer + */ + @Override + public ArrowBuf getValidityBuffer() { + return validityBuffer; + } + + /** + * Get the buffer that stores the data for elements in the vector. + * + * @return buffer + */ + @Override + public ArrowBuf getDataBuffer() { + return valueBuffer; + } + + /** + * buffer that stores the offsets for elements + * in the vector. This operation is not supported for fixed-width vectors. + * + * @return buffer + */ + @Override + public ArrowBuf getOffsetBuffer() { + return offsetBuffer; + } + + /** + * Get the memory address of buffer that stores the offsets for elements + * in the vector. + * + * @return starting address of the buffer + */ + @Override + public long getOffsetBufferAddress() { + return offsetBuffer.memoryAddress(); + } + + /** + * Get the memory address of buffer that manages the validity + * (NULL or NON-NULL nature) of elements in the vector. + * + * @return starting address of the buffer + */ + @Override + public long getValidityBufferAddress() { + return validityBuffer.memoryAddress(); + } + + /** + * Get the memory address of buffer that stores the data for elements + * in the vector. + * + * @return starting address of the buffer + */ + @Override + public long getDataBufferAddress() { + return valueBuffer.memoryAddress(); + } + + /** + * Sets the desired value capacity for the vector. This function doesn't + * allocate any memory for the vector. + * + * @param valueCount desired number of elements in the vector + */ + @Override + public void setInitialCapacity(int valueCount) { + final long size = (long) valueCount * DEFAULT_RECORD_BYTE_COUNT; + checkDataBufferSize(size); + computeAndCheckOffsetsBufferSize(valueCount); + lastValueAllocationSizeInBytes = (int) size; + lastValueCapacity = valueCount; + } + + /** + * Sets the desired value capacity for the vector. This function doesn't + * allocate any memory for the vector. + * + * @param valueCount desired number of elements in the vector + * @param density average number of bytes per variable width element + */ + @Override + public void setInitialCapacity(int valueCount, double density) { Review Comment: @lidavidm Thinking about this again, I feel like this method is not practical for views. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
