Repository: drill Updated Branches: refs/heads/master eb0c40306 -> 40de8ca4f
http://git-wip-us.apache.org/repos/asf/drill/blob/40de8ca4/exec/vector/src/main/java/org/apache/drill/exec/vector/accessor/writer/dummy/package-info.java ---------------------------------------------------------------------- diff --git a/exec/vector/src/main/java/org/apache/drill/exec/vector/accessor/writer/dummy/package-info.java b/exec/vector/src/main/java/org/apache/drill/exec/vector/accessor/writer/dummy/package-info.java new file mode 100644 index 0000000..9bc654b --- /dev/null +++ b/exec/vector/src/main/java/org/apache/drill/exec/vector/accessor/writer/dummy/package-info.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This package provides a "dummy" set of writers. The dummy writers provide + * the same API as the "real" writers, but the dummy writers simply discard + * their data. The dummy writers are used when implementing projection: + * non-projected columns may still have to be processed (as in a CSV file, + * say), but their values are not needed. 
One way to do this is to do an + * if-statement for each value:<pre><code> + * if (column-a-is-projected) { + * aWriter.setSomething(value); + * }</code></pre> + * The dummy writers convert the if-statement into a virtual function call, + * same as is done to handle the type-specific nature of vectors: + * <pre><code> + * aWriter.setSomething(value); + * </code></pre> + * <p> + * The theory is that the virtual function dispatch is simpler, and faster, + * than doing continual if-checks everywhere in the code. + * <p> + * The dummy writers reside in this package so that the various factory + * methods can automatically build the dummy versions when given a null + * value vector (which we then interpret to mean that there is no physical + * backing to the column.) + * <p> + * At present, most methods that return a value simply return zero or + * null. + * Experience will show whether it is worthwhile implementing some + * basics, such as a value type or index. For now, these return null, + * assuming that the caller won't do anything with the column other + * than set a value. + * <p> + * Some simpler dummy writers appear as nested classes inside the + * "real" writers. + */ + +package org.apache.drill.exec.vector.accessor.writer.dummy; http://git-wip-us.apache.org/repos/asf/drill/blob/40de8ca4/exec/vector/src/main/java/org/apache/drill/exec/vector/accessor/writer/package-info.java ---------------------------------------------------------------------- diff --git a/exec/vector/src/main/java/org/apache/drill/exec/vector/accessor/writer/package-info.java b/exec/vector/src/main/java/org/apache/drill/exec/vector/accessor/writer/package-info.java new file mode 100644 index 0000000..f536c09 --- /dev/null +++ b/exec/vector/src/main/java/org/apache/drill/exec/vector/accessor/writer/package-info.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Implementation of the vector writers. The code will make much more sense if + * we start with a review of Drill's complex vector data model. Drill has 38+ + * data ("minor") types. Drill also has three cardinalities ("modes"). The + * result is over 120 different vector types. Then, when you add maps, repeated + * maps, lists and repeated lists, you rapidly get an explosion of types that + * the writer code must handle. + * + * <h4>Understanding the Vector Model</h4> + * + * Vectors can be categorized along multiple dimensions: + * <ul> + * <li>By data (minor) type</li> + * <li>By cardinality (mode)</li> + * <li>By fixed or variable width</li> + * <li>By repeat levels</li> + * </ul> + * <p> + * A repeated map, a list, a repeated list and any array (repeated) scalar all + * are array-like. Nullable and required modes are identical (single values), + * but a nullable has an additional is-set ("bit") vector. 
+ * <p> + * The writers (and readers) borrow concepts from JSON and relational theory + * to simplify the problem: + * <p> + * <ul> + * <li>Both the top-level row and a Drill map are "tuples" and are treated + * similarly in the model.</li> + * <li>All non-map, non-list (that is, scalar) data types are treated + * uniformly.</li> + * <li>All arrays (whether a list, a repeated list, a repeated map, or a + * repeated scalar) are treated uniformly.</li> + * </ul> + * + * <h4>Repeat Levels</h4> + * + * JSON and Parquet can be understood as a series of one or more "repeat + * levels." First, let's identify the repeat levels above the batch + * level: + * <ul> + * <li>The top-most level is the "result set": the entire collection of + * rows that come from a file (or other data source.)</li> + * <li>Result sets are divided into batches: collections of up to 64K + * rows.</li> + * </ul> + * + * Then, within a batch: + * <ul> + * <li>Each batch is a collection of rows. A batch-level index points + * to the current row.</li> + * <li>Scalar arrays introduce a repeat level: each row has 0, 1 or + * many elements in the array-valued column. An offset vector indexes + * to the first value for each row. Each scalar array has its own + * per-array index to point to the next write position.</li> + * <li>Map arrays introduce a repeat level for a group of columns + * (those that make up the map.) A single offset vector points to + * the common start position for the columns. A common index points + * to the common next write position.</li> + * <li>Lists also introduce a repeat level. (Details to be worked + * out.)</li> + * </ul> + * + * For repeated vectors, one can think of the structure either top-down + * or bottom-up: + * <ul> + * <li>Top-down: the row position points into an offset vector. The + * offset vector value points to either the data value, or into another + * offset vector.</li> + * <li>Bottom-up: values are appended to the end of the vector. 
Values + * are "pinched off" to form an array (for repeated maps) or for a row. + * In this view, indexes bubble upward. The inner-most last write position + * is written as the array end position in the enclosing offset vector. + * This may occur up through several levels.</li> + * </ul> + * + * <h4>Writer Data Model</h4> + * + * The above leads to a very simple, JSON-like data model: + * <ul> + * <li>A tuple reader or writer models a row. (Usually via a subclass.) Columns + * are accessible by name or position.</li> + * <li>Every column is modeled as an object.</li> + * <li>The object can have an object type: scalar, tuple or array.</li> + * <li>An array has a single element type (but many run-time elements)</li> + * <li>A scalar can be nullable or not, and provides a uniform get/set + * interface.</li> + * </ul> + * <p> + * This data model is similar to, but has important differences from, the prior, + * generated, readers and writers. + * <p> + * The object layer is new: it is the simplest way to model the three "object + * types." An app using this code would use just the leaf scalar readers and + * writers. + * + * <h4>Writer Performance</h4> + * + * To maximize performance, have a single version for all "data modes": + * (nullable, required, repeated). Some items of note: + * <ul> + * <li>The writers bypass DrillBuf and the UDLE to perform needed writes to direct + * memory.</li> + * <li>The writers cache the buffer address and implement a number of methods + * to synchronize that address when the buffer changes (on a new batch or during + * vector resize).</li> + * <li>Writing requires a single bounds check. In most cases, the write is within + * bounds so the single check is all that is needed.</li> + * <li>If the write is out of bounds, then the writer determines the new vector + * size and performs the needed reallocation. 
To avoid multiple doublings, the + * writer computes the needed new size and allocates that size directly.</li> + * <li>Vector reallocation is improved to eliminate zeroing the new half of the + * buffer; data is left "garbage-filled."</li> + * <li>If the vector would grow beyond 16 MB, then overflow is triggered, via a + * listener, which causes the buffer to be replaced. The write then + * continues.</li> + * <li>Offset vector updates are integrated into the writers using an + * {@code OffsetVectorWriter}. This writer caches the last write position so that each + * array write needs a single offset update, rather than the read and write as + * in previous code.</li> + * <li>The writers keep track of the "last write position" and perform + * "fill-empties" work if the new write position is more than one position + * beyond the last write. All types now correctly support "fill-empties" + * (before, only nullable types did so reliably.)</li> + * <li>Null handling is done by an additional writer layer that wraps the + * underlying data writer. 
This avoids the need for a special nullable writer: + * the same nullable layer works for all data types.</li> + * <li>Array handling is done similarly: an array writer manages the offset + * vector and works the same for repeated scalars, repeated maps and + * (eventually) lists and repeated lists.</li> + * </ul> + */ + +package org.apache.drill.exec.vector.accessor.writer; http://git-wip-us.apache.org/repos/asf/drill/blob/40de8ca4/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/AbstractMapVector.java ---------------------------------------------------------------------- diff --git a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/AbstractMapVector.java b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/AbstractMapVector.java index 6b60471..5ac28c5 100644 --- a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/AbstractMapVector.java +++ b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/AbstractMapVector.java @@ -64,7 +64,6 @@ public abstract class AbstractMapVector extends AbstractContainerVector { valueVector.close(); } vectors.clear(); - super.close(); } @@ -178,7 +177,7 @@ public abstract class AbstractMapVector extends AbstractContainerVector { * * Note that this method does not enforce any vector type check nor throws a schema change exception. 
*/ - protected void putChild(String name, ValueVector vector) { + public void putChild(String name, ValueVector vector) { putVector(name, vector); field.addChild(vector.getField()); } @@ -280,6 +279,16 @@ public abstract class AbstractMapVector extends AbstractContainerVector { } @Override + public int getAllocatedSize() { + int size = 0; + + for (final ValueVector v : vectors.values()) { + size += v.getAllocatedSize(); + } + return size; + } + + @Override public void collectLedgers(Set<BufferLedger> ledgers) { for (final ValueVector v : vectors.values()) { v.collectLedgers(ledgers); http://git-wip-us.apache.org/repos/asf/drill/blob/40de8ca4/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/BaseRepeatedValueVector.java ---------------------------------------------------------------------- diff --git a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/BaseRepeatedValueVector.java b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/BaseRepeatedValueVector.java index 2b41b8b..8472f80 100644 --- a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/BaseRepeatedValueVector.java +++ b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/BaseRepeatedValueVector.java @@ -87,14 +87,10 @@ public abstract class BaseRepeatedValueVector extends BaseValueVector implements @Override - public UInt4Vector getOffsetVector() { - return offsets; - } + public UInt4Vector getOffsetVector() { return offsets; } @Override - public ValueVector getDataVector() { - return vector; - } + public ValueVector getDataVector() { return vector; } @Override public void setInitialCapacity(int numRecords) { @@ -127,6 +123,11 @@ public abstract class BaseRepeatedValueVector extends BaseValueVector implements } @Override + public int getAllocatedSize() { + return offsets.getAllocatedSize() + vector.getAllocatedSize(); + } + + @Override public int getBufferSizeFor(int valueCount) { if (valueCount == 0) { return 0; 
http://git-wip-us.apache.org/repos/asf/drill/blob/40de8ca4/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/ListVector.java ---------------------------------------------------------------------- diff --git a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/ListVector.java b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/ListVector.java index 9569946..7de5ce6 100644 --- a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/ListVector.java +++ b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/ListVector.java @@ -52,7 +52,6 @@ public class ListVector extends BaseRepeatedValueVector { private Accessor accessor = new Accessor(); private UnionListWriter writer; private UnionListReader reader; - private CallBack callBack; public ListVector(MaterializedField field, BufferAllocator allocator, CallBack callBack) { super(field, allocator); @@ -61,7 +60,6 @@ public class ListVector extends BaseRepeatedValueVector { this.field.addChild(getDataVector().getField()); this.writer = new UnionListWriter(this); this.reader = new UnionListReader(this); - this.callBack = callBack; } public UnionListWriter getWriter() { @@ -203,6 +201,8 @@ public class ListVector extends BaseRepeatedValueVector { .addChild(bits.getMetadata()) .addChild(vector.getMetadata()); } + + @Override public <T extends ValueVector> AddOrGetResult<T> addOrGetVector(VectorDescriptor descriptor) { AddOrGetResult<T> result = super.addOrGetVector(descriptor); reader = new UnionListReader(this); http://git-wip-us.apache.org/repos/asf/drill/blob/40de8ca4/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/MapVector.java ---------------------------------------------------------------------- diff --git a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/MapVector.java b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/MapVector.java index 19c910b..4a501b8 100644 --- 
a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/MapVector.java +++ b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/MapVector.java @@ -67,10 +67,7 @@ public class MapVector extends AbstractMapVector { } @Override - public FieldReader getReader() { - //return new SingleMapReaderImpl(MapVector.this); - return reader; - } + public FieldReader getReader() { return reader; } transient private MapTransferPair ephPair; transient private MapSingleCopier ephPair2; @@ -95,9 +92,7 @@ public class MapVector extends AbstractMapVector { } @Override - protected boolean supportsDirectRead() { - return true; - } + protected boolean supportsDirectRead() { return true; } public Iterator<String> fieldNameIterator() { return getChildFieldNames().iterator(); @@ -124,6 +119,15 @@ public class MapVector extends AbstractMapVector { } @Override + public int getAllocatedSize() { + int size = 0; + for (final ValueVector v : this) { + size += v.getAllocatedSize(); + } + return size; + } + + @Override public int getBufferSizeFor(final int valueCount) { if (valueCount == 0) { return 0; @@ -353,6 +357,10 @@ public class MapVector extends AbstractMapVector { return getChildByOrdinal(id); } + public void setMapValueCount(int valueCount) { + this.valueCount = valueCount; + } + public class Mutator extends BaseValueVector.BaseMutator { @Override @@ -360,7 +368,7 @@ public class MapVector extends AbstractMapVector { for (final ValueVector v : getChildren()) { v.getMutator().setValueCount(valueCount); } - MapVector.this.valueCount = valueCount; + setMapValueCount(valueCount); } @Override http://git-wip-us.apache.org/repos/asf/drill/blob/40de8ca4/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedListVector.java ---------------------------------------------------------------------- diff --git a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedListVector.java 
b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedListVector.java index be9ebee..6442417 100644 --- a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedListVector.java +++ b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedListVector.java @@ -198,14 +198,10 @@ public class RepeatedListVector extends AbstractContainerVector } @Override - public RepeatedListAccessor getAccessor() { - return accessor; - } + public RepeatedListAccessor getAccessor() { return accessor; } @Override - public RepeatedListMutator getMutator() { - return mutator; - } + public RepeatedListMutator getMutator() { return mutator; } @Override public FieldReader getReader() { @@ -277,11 +273,8 @@ public class RepeatedListVector extends AbstractContainerVector } } - - @Override - public RepeatedListReaderImpl getReader() { - return reader; - } + @Override + public RepeatedListReaderImpl getReader() { return reader; } @Override public DelegateRepeatedVector.RepeatedListAccessor getAccessor() { @@ -334,6 +327,11 @@ public class RepeatedListVector extends AbstractContainerVector } @Override + public int getAllocatedSize() { + return delegate.getAllocatedSize(); + } + + @Override public int getBufferSizeFor(final int valueCount) { return delegate.getBufferSizeFor(valueCount); } http://git-wip-us.apache.org/repos/asf/drill/blob/40de8ca4/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedMapVector.java ---------------------------------------------------------------------- diff --git a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedMapVector.java b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedMapVector.java index 6b29258..57f1a67 100644 --- a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedMapVector.java +++ b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedMapVector.java @@ -63,18 +63,22 @@ public class RepeatedMapVector 
extends AbstractMapVector private final Mutator mutator = new Mutator(); private final EmptyValuePopulator emptyPopulator; - public RepeatedMapVector(MaterializedField field, BufferAllocator allocator, CallBack callBack){ + public RepeatedMapVector(MaterializedField field, BufferAllocator allocator, CallBack callBack) { super(field, allocator, callBack); this.offsets = new UInt4Vector(BaseRepeatedValueVector.OFFSETS_FIELD, allocator); this.emptyPopulator = new EmptyValuePopulator(offsets); } - @Override - public UInt4Vector getOffsetVector() { - return offsets; + public RepeatedMapVector(MaterializedField field, UInt4Vector offsets, CallBack callBack) { + super(field, offsets.getAllocator(), callBack); + this.offsets = offsets; + this.emptyPopulator = new EmptyValuePopulator(offsets); } @Override + public UInt4Vector getOffsetVector() { return offsets; } + + @Override public ValueVector getDataVector() { throw new UnsupportedOperationException(); } @@ -93,9 +97,7 @@ public class RepeatedMapVector extends AbstractMapVector } @Override - public RepeatedMapReaderImpl getReader() { - return reader; - } + public RepeatedMapReaderImpl getReader() { return reader; } @Override public void allocateNew(int groupCount, int innerValueCount) { @@ -137,6 +139,11 @@ public class RepeatedMapVector extends AbstractMapVector } @Override + public int getAllocatedSize() { + return offsets.getAllocatedSize() + super.getAllocatedSize(); + } + + @Override public int getBufferSizeFor(final int valueCount) { if (valueCount == 0) { return 0;
