Repository: arrow Updated Branches: refs/heads/master 3b946b822 -> 33c731dbd
ARROW-398: Java file format requires bitmaps of all 1's to be written⦠⦠when there are no nulls Author: Julien Le Dem <jul...@dremio.com> Closes #222 from julienledem/empty_buf and squashes the following commits: c29da53 [Julien Le Dem] fix extraneous bits 4e87d88 [Julien Le Dem] ARROW-398: Java file format requires bitmaps of all 1's to be written when there are no nulls Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/33c731db Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/33c731db Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/33c731db Branch: refs/heads/master Commit: 33c731dbd69331b0d7ce0dc924791db4ca461009 Parents: 3b946b8 Author: Julien Le Dem <jul...@dremio.com> Authored: Fri Dec 2 10:32:36 2016 -0500 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Fri Dec 2 10:32:36 2016 -0500 ---------------------------------------------------------------------- .../codegen/templates/NullableValueVectors.java | 2 +- .../src/main/codegen/templates/UnionVector.java | 2 +- .../arrow/vector/BaseDataValueVector.java | 7 +- .../java/org/apache/arrow/vector/BitVector.java | 36 ++++++++++ .../org/apache/arrow/vector/BufferBacked.java | 4 +- .../org/apache/arrow/vector/ValueVector.java | 17 ----- .../org/apache/arrow/vector/VectorLoader.java | 1 - .../apache/arrow/vector/complex/ListVector.java | 2 +- .../arrow/vector/complex/NullableMapVector.java | 2 +- .../arrow/vector/TestVectorUnloadLoad.java | 69 ++++++++++++++++++++ .../vector/file/TestArrowReaderWriter.java | 4 -- 11 files changed, 116 insertions(+), 30 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/33c731db/java/vector/src/main/codegen/templates/NullableValueVectors.java ---------------------------------------------------------------------- diff --git a/java/vector/src/main/codegen/templates/NullableValueVectors.java b/java/vector/src/main/codegen/templates/NullableValueVectors.java index 48af7a2..716fedc 100644 --- a/java/vector/src/main/codegen/templates/NullableValueVectors.java +++ b/java/vector/src/main/codegen/templates/NullableValueVectors.java @@ -144,7 +144,7 @@ public final class ${className} extends BaseDataValueVector implements <#if type @Override public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) { - org.apache.arrow.vector.BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers); + org.apache.arrow.vector.BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers); bits.valueCount = fieldNode.getLength(); } http://git-wip-us.apache.org/repos/asf/arrow/blob/33c731db/java/vector/src/main/codegen/templates/UnionVector.java ---------------------------------------------------------------------- diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index 5ca3f90..9608b3c 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -105,7 +105,7 @@ public class UnionVector implements FieldVector { @Override public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) { - BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers); + BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers); this.valueCount = fieldNode.getLength(); } http://git-wip-us.apache.org/repos/asf/arrow/blob/33c731db/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java ---------------------------------------------------------------------- diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java index c22258d..4c6d363 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.List; import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.schema.ArrowFieldNode; import io.netty.buffer.ArrowBuf; @@ -29,13 +30,13 @@ public abstract class BaseDataValueVector extends BaseValueVector implements Buf protected final static byte[] emptyByteArray = new byte[]{}; // Nullable vectors use this - public static void load(List<BufferBacked> vectors, List<ArrowBuf> buffers) { + public static void load(ArrowFieldNode fieldNode, List<BufferBacked> vectors, List<ArrowBuf> buffers) { int expectedSize = vectors.size(); if (buffers.size() != expectedSize) { throw new IllegalArgumentException("Illegal buffer count, expected " + expectedSize + ", got: " + buffers.size()); } for (int i = 0; i < expectedSize; i++) { - vectors.get(i).load(buffers.get(i)); + vectors.get(i).load(fieldNode, buffers.get(i)); } } @@ -106,7 +107,7 @@ public abstract class BaseDataValueVector extends BaseValueVector implements Buf } @Override - public void load(ArrowBuf data) { + public void load(ArrowFieldNode fieldNode, ArrowBuf data) { this.data.release(); this.data = data.retain(allocator); } http://git-wip-us.apache.org/repos/asf/arrow/blob/33c731db/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java ---------------------------------------------------------------------- diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index c12db50..7ce1236 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -22,6 +22,7 @@ import org.apache.arrow.memory.OutOfMemoryException; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.holders.BitHolder; import org.apache.arrow.vector.holders.NullableBitHolder; +import org.apache.arrow.vector.schema.ArrowFieldNode; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.util.OversizedAllocationException; @@ -49,6 +50,41 @@ public final class BitVector extends BaseDataValueVector implements FixedWidthVe } @Override + public void load(ArrowFieldNode fieldNode, ArrowBuf data) { + // When the vector is all nulls or all defined, the content of the buffer can be omitted + if (data.readableBytes() == 0 && fieldNode.getLength() != 0) { + data.release(); + int count = fieldNode.getLength(); + allocateNew(count); + int n = getSizeFromCount(count); + if (fieldNode.getNullCount() == 0) { + // all defined + // create an all 1s buffer + // set full bytes + int fullBytesCount = count / 8; + for (int i = 0; i < fullBytesCount; ++i) { + this.data.setByte(i, 0xFF); + } + int remainder = count % 8; + // set remaining bits + if (remainder > 0) { + byte bitMask = (byte) (0xFFL >>> ((8 - remainder) & 7));; + this.data.setByte(fullBytesCount, bitMask); + } + } else if (fieldNode.getNullCount() == fieldNode.getLength()) { + // all null + // create an all 0s buffer + zeroVector(); + } else { + throw new IllegalArgumentException("The buffer can be empty only if there's no data or it's all null or all defined"); + } + this.data.writerIndex(n); + } else { + super.load(fieldNode, data); + } + } + + @Override public Field getField() { throw new UnsupportedOperationException("internal vector"); } http://git-wip-us.apache.org/repos/asf/arrow/blob/33c731db/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java ---------------------------------------------------------------------- diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java b/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java index d1c262d..3c8b321 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java @@ -17,6 +17,8 @@ */ package org.apache.arrow.vector; +import org.apache.arrow.vector.schema.ArrowFieldNode; + import io.netty.buffer.ArrowBuf; /** @@ -24,7 +26,7 @@ import io.netty.buffer.ArrowBuf; */ public interface BufferBacked { - void load(ArrowBuf data); + void load(ArrowFieldNode fieldNode, ArrowBuf data); ArrowBuf unLoad(); http://git-wip-us.apache.org/repos/asf/arrow/blob/33c731db/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java ---------------------------------------------------------------------- diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java index ba7790e..5b24a41 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java @@ -131,13 +131,6 @@ public interface ValueVector extends Closeable, Iterable<ValueVector> { FieldReader getReader(); /** - * Get the metadata for this field. Used in serialization - * - * @return FieldMetadata for this field. - */ -// SerializedField getMetadata(); - - /** * Returns the number of bytes that is used by this vector instance. */ int getBufferSize(); @@ -167,16 +160,6 @@ public interface ValueVector extends Closeable, Iterable<ValueVector> { ArrowBuf[] getBuffers(boolean clear); /** - * Load the data provided in the buffer. Typically used when deserializing from the wire. - * - * @param metadata - * Metadata used to decode the incoming buffer. - * @param buffer - * The buffer that contains the ValueVector. - */ -// void load(SerializedField metadata, DrillBuf buffer); - - /** * An abstraction that is used to read from this vector instance. */ interface Accessor { http://git-wip-us.apache.org/repos/asf/arrow/blob/33c731db/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java ---------------------------------------------------------------------- diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java index c5d642e..757f061 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java @@ -81,7 +81,6 @@ public class VectorLoader { try { vector.loadFieldBuffers(fieldNode, ownBuffers); } catch (RuntimeException e) { - e.printStackTrace(); throw new IllegalArgumentException("Could not load buffers for field " + field + " error message" + e.getMessage(), e); } http://git-wip-us.apache.org/repos/asf/arrow/blob/33c731db/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java ---------------------------------------------------------------------- diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index dd99c73..e18f99f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -93,7 +93,7 @@ public class ListVector extends BaseRepeatedValueVector implements FieldVector { @Override public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) { - BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers); + BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers); } @Override http://git-wip-us.apache.org/repos/asf/arrow/blob/33c731db/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java ---------------------------------------------------------------------- diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java index 8e1bbfa..f0ddf27 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java @@ -62,7 +62,7 @@ public class NullableMapVector extends MapVector implements FieldVector { @Override public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) { - BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers); + BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers); this.valueCount = fieldNode.getLength(); } http://git-wip-us.apache.org/repos/asf/arrow/blob/33c731db/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java ---------------------------------------------------------------------- diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java index 78f69ee..9dfe8d8 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java @@ -17,7 +17,13 @@ */ package org.apache.arrow.vector; +import static java.util.Arrays.asList; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + import java.io.IOException; +import java.util.Collections; import java.util.List; import org.apache.arrow.memory.BufferAllocator; @@ -29,12 +35,17 @@ import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter; import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; import org.apache.arrow.vector.complex.writer.BigIntWriter; import org.apache.arrow.vector.complex.writer.IntWriter; +import org.apache.arrow.vector.schema.ArrowFieldNode; import org.apache.arrow.vector.schema.ArrowRecordBatch; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; import org.junit.AfterClass; import org.junit.Assert; import org.junit.Test; +import io.netty.buffer.ArrowBuf; + public class TestVectorUnloadLoad { static final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); @@ -88,6 +99,64 @@ public class TestVectorUnloadLoad { } } + /** + * The validity buffer can be empty if: + * - all values are defined + * - all values are null + * @throws IOException + */ + @Test + public void testLoadEmptyValidityBuffer() throws IOException { + Schema schema = new Schema(asList( + new Field("intDefined", true, new ArrowType.Int(32, true), Collections.<Field>emptyList()), + new Field("intNull", true, new ArrowType.Int(32, true), Collections.<Field>emptyList()) + )); + int count = 10; + ArrowBuf validity = allocator.getEmpty(); + ArrowBuf values = allocator.buffer(count * 4); // integers + for (int i = 0; i < count; i++) { + values.setInt(i * 4, i); + } + try ( + ArrowRecordBatch recordBatch = new ArrowRecordBatch(count, asList(new ArrowFieldNode(count, 0), new ArrowFieldNode(count, count)), asList(validity, values, validity, values)); + BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); + VectorSchemaRoot newRoot = new VectorSchemaRoot(schema, finalVectorsAllocator); + ) { + + // load it + VectorLoader vectorLoader = new VectorLoader(newRoot); + + vectorLoader.load(recordBatch); + + NullableIntVector intDefinedVector = (NullableIntVector)newRoot.getVector("intDefined"); + NullableIntVector intNullVector = (NullableIntVector)newRoot.getVector("intNull"); + for (int i = 0; i < count; i++) { + assertFalse("#" + i, intDefinedVector.getAccessor().isNull(i)); + assertEquals("#" + i, i, intDefinedVector.getAccessor().get(i)); + assertTrue("#" + i, intNullVector.getAccessor().isNull(i)); + } + intDefinedVector.getMutator().setSafe(count + 10, 1234); + assertTrue(intDefinedVector.getAccessor().isNull(count + 1)); + // empty slots should still default to unset + intDefinedVector.getMutator().setSafe(count + 1, 789); + assertFalse(intDefinedVector.getAccessor().isNull(count + 1)); + assertEquals(789, intDefinedVector.getAccessor().get(count + 1)); + assertTrue(intDefinedVector.getAccessor().isNull(count)); + assertTrue(intDefinedVector.getAccessor().isNull(count + 2)); + assertTrue(intDefinedVector.getAccessor().isNull(count + 3)); + assertTrue(intDefinedVector.getAccessor().isNull(count + 4)); + assertTrue(intDefinedVector.getAccessor().isNull(count + 5)); + assertTrue(intDefinedVector.getAccessor().isNull(count + 6)); + assertTrue(intDefinedVector.getAccessor().isNull(count + 7)); + assertTrue(intDefinedVector.getAccessor().isNull(count + 8)); + assertTrue(intDefinedVector.getAccessor().isNull(count + 9)); + assertFalse(intDefinedVector.getAccessor().isNull(count + 10)); + assertEquals(1234, intDefinedVector.getAccessor().get(count + 10)); + } finally { + values.release(); + } + } + public static VectorUnloader newVectorUnloader(FieldVector root) { Schema schema = new Schema(root.getField().getChildren()); int valueCount = root.getAccessor().getValueCount(); http://git-wip-us.apache.org/repos/asf/arrow/blob/33c731db/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java ---------------------------------------------------------------------- diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java index f90329a..8ed89fa 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java @@ -30,10 +30,6 @@ import java.util.List; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.file.ArrowBlock; -import org.apache.arrow.vector.file.ArrowFooter; -import org.apache.arrow.vector.file.ArrowReader; -import org.apache.arrow.vector.file.ArrowWriter; import org.apache.arrow.vector.schema.ArrowFieldNode; import org.apache.arrow.vector.schema.ArrowRecordBatch; import org.apache.arrow.vector.types.pojo.ArrowType;