vibhatha commented on code in PR #38423: URL: https://github.com/apache/arrow/pull/38423#discussion_r1424816799
########## java/vector/src/main/java/org/apache/arrow/vector/dictionary/BatchedDictionary.java: ########## @@ -0,0 +1,312 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.dictionary; + +import java.io.Closeable; +import java.io.IOException; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.util.hash.MurmurHasher; +import org.apache.arrow.util.VisibleForTesting; +import org.apache.arrow.vector.BaseIntVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.DictionaryEncoding; +import org.apache.arrow.vector.types.pojo.FieldType; + +/** + * A dictionary implementation for continuous encoding of data in a dictionary and + * index vector as opposed to the {@link Dictionary} that encodes a complete vector. + * Supports delta or replacement encoding. + */ +public class BatchedDictionary implements Closeable, BaseDictionary { + + private final DictionaryEncoding encoding; + + private final BaseVariableWidthVector dictionary; + + private final BaseIntVector indexVector; + + private final DictionaryHashTable hashTable; + + private int deltaIndex; + + private int dictionaryIndex; + + private boolean oneTimeEncoding; + + private boolean wasReset; + + private boolean vectorsProvided; + + /** + * Creates a dictionary and index vector with the respective types. The dictionary vector + * will be named "{name}-dictionary". + * <p> + * To use this dictionary, provide the dictionary vector to a {@link DictionaryProvider}, + * add the {@link #getIndexVector()} to the {@link org.apache.arrow.vector.VectorSchemaRoot} + * and call the {@link #setSafe(int, byte[], int, int)} or other set methods. + * + * @param name A name for the vector and dictionary. + * @param encoding The dictionary encoding to use. + * @param dictionaryType The type of the dictionary data. + * @param indexType The type of the encoded dictionary index. + * @param allocator The allocator to use. + */ + public BatchedDictionary( + String name, + DictionaryEncoding encoding, + ArrowType dictionaryType, + ArrowType indexType, + BufferAllocator allocator + ) { + this(name, encoding, dictionaryType, indexType, allocator, "-dictionary"); + } + + /** + * Creates a dictionary index vector with the respective types. + * + * @param name A name for the vector and dictionary. + * @param encoding The dictionary encoding to use. + * @param dictionaryType The type of the dictionary data. + * @param indexType The type of the encoded dictionary index. + * @param allocator The allocator to use. + * @param suffix A non-null suffix to append to the name of the dictionary. + */ + public BatchedDictionary( + String name, + DictionaryEncoding encoding, + ArrowType dictionaryType, + ArrowType indexType, + BufferAllocator allocator, + String suffix + ) { + this(name, encoding, dictionaryType, indexType, allocator, suffix, false); + } + + /** + * Creates a dictionary index vector with the respective types. + * + * @param name A name for the vector and dictionary. + * @param encoding The dictionary encoding to use. + * @param dictionaryType The type of the dictionary data. + * @param indexType The type of the encoded dictionary index. + * @param allocator The allocator to use. + * @param suffix A non-null suffix to append to the name of the dictionary. + * @param oneTimeEncoding A mode where the entries can be added to the dictionary until + * the first stream batch is written. After that, any new entries + * to the dictionary will throw an exception. + */ + public BatchedDictionary( + String name, + DictionaryEncoding encoding, + ArrowType dictionaryType, + ArrowType indexType, + BufferAllocator allocator, + String suffix, + boolean oneTimeEncoding + ) { + this.encoding = encoding; + this.oneTimeEncoding = oneTimeEncoding; + if (dictionaryType.getTypeID() != ArrowType.ArrowTypeID.Utf8 && + dictionaryType.getTypeID() != ArrowType.ArrowTypeID.Binary) { Review Comment: Can the `dictionaryType.getTypeID()` be `Utf8` and `Binary` at the same time? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
