Repository: drill
Updated Branches:
  refs/heads/master eb0c40306 -> 40de8ca4f


http://git-wip-us.apache.org/repos/asf/drill/blob/40de8ca4/exec/vector/src/main/java/org/apache/drill/exec/vector/accessor/writer/dummy/package-info.java
----------------------------------------------------------------------
diff --git 
a/exec/vector/src/main/java/org/apache/drill/exec/vector/accessor/writer/dummy/package-info.java
 
b/exec/vector/src/main/java/org/apache/drill/exec/vector/accessor/writer/dummy/package-info.java
new file mode 100644
index 0000000..9bc654b
--- /dev/null
+++ 
b/exec/vector/src/main/java/org/apache/drill/exec/vector/accessor/writer/dummy/package-info.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * This package provides a "dummy" set of writers. The dummy writers provide
+ * the same API as the "real" writers, but the dummy writers simply discard
+ * their data. The dummy writers are used when implementing projection:
+ * non-projected columns may still have to be processed (as in a CSV file,
+ * say), but their values are not needed. One way to do this is to do an
+ * if-statement for each value:<pre><code>
+ * if (column-a-is-projected) {
+ *   aWriter.setSomething(value);
+ * }</code></pre>
+ * The dummy writers convert the if-statement into a virtual function call,
+ * same as is done to handle the type-specific nature of vectors:
+ * <pre><code>
+ * aWriter.setSomething(value);
+ * </code></pre>
+ * <p>
+ * The theory is that the virtual function dispatch is simpler, and faster,
+ * than doing continual if-checks everywhere in the code.
+ * <p>
+ * The dummy writers reside in this package so that the various factory
+ * methods can automatically build the dummy versions when given a null
+ * value vector (which we then interpret to mean that there is no physical
+ * backing to the column.)
+ * <p>
+ * At present, most methods that return a value simply return zero or
+ * null.
+ * Experience will show whether it is worthwhile implementing some
+ * basics, such as a value type or index. For now, these return null,
+ * assuming that the caller won't do anything with the column other
+ * than set a value.
+ * <p>
+ * Some simpler dummy writers appear as nested classes inside the
+ * "real" writers.
+ */
+
+package org.apache.drill.exec.vector.accessor.writer.dummy;

http://git-wip-us.apache.org/repos/asf/drill/blob/40de8ca4/exec/vector/src/main/java/org/apache/drill/exec/vector/accessor/writer/package-info.java
----------------------------------------------------------------------
diff --git 
a/exec/vector/src/main/java/org/apache/drill/exec/vector/accessor/writer/package-info.java
 
b/exec/vector/src/main/java/org/apache/drill/exec/vector/accessor/writer/package-info.java
new file mode 100644
index 0000000..f536c09
--- /dev/null
+++ 
b/exec/vector/src/main/java/org/apache/drill/exec/vector/accessor/writer/package-info.java
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Implementation of the vector writers. The code will make much more sense if
+ * we start with a review of Drill’s complex vector data model. Drill has 38+
+ * data (“minor”) types. Drill also has three cardinalities (“modes”). The
+ * result is over 120 different vector types. Then, when you add maps, repeated
+ * maps, lists and repeated lists, you rapidly get an explosion of types that
+ * the writer code must handle.
+ *
+ * <h4>Understanding the Vector Model</h4>
+ *
+ * Vectors can be categorized along multiple dimensions:
+ * <ul>
+ * <li>By data (minor) type</li>
+ * <li>By cardinality (mode)</li>
+ * <li>By fixed or variable width</li>
+ * <li>By repeat levels</li>
+ * </ul>
+ * <p>
+ * A repeated map, a list, a repeated list and any array (repeated) scalar all
+ * are array-like. Nullable and required modes are identical (single values),
+ * but a nullable has an additional is-set (“bit”) vector.
+ * <p>
+ * The writers (and readers) borrow concepts from JSON and relational theory
+ * to simplify the problem:
+ * <p>
+ * <ul>
+ * <li>Both the top-level row, and a Drill map are “tuples” and are treated
+ * similarly in the model.</li>
+ * <li>All non-map, non-list (that is, scalar) data types are treated
+ * uniformly.</li>
+ * <li>All arrays (whether a list, a repeated list, a repeated map, or a
+ * repeated scalar) are treated uniformly.</li>
+ * </ul>
+ *
+ * <h4>Repeat Levels</h4>
+ *
+ * JSON and Parquet can be understood as a series of one or more "repeat
+ * levels." First, let's identify the repeat levels above the batch
+ * level:
+ * <ul>
+ * <li>The top-most level is the "result set": the entire collection of
+ * rows that come from a file (or other data source.)</li>
+ * <li>Result sets are divided into batches: collections of up to 64K
+ * rows.</li>
+ * </ul>
+ *
+ * Then, within a batch:
+ * <ul>
+ * <li>Each batch is a collection of rows. A batch-level index points
+ * to the current row.</li>
+ * <li>Scalar arrays introduce a repeat level: each row has 0, 1 or
+ * many elements in the array-valued column. An offset vector indexes
+ * to the first value for each row. Each scalar array has its own
+ * per-array index to point to the next write position.</li>
+ * <li>Map arrays introduce a repeat level for a group of columns
+ * (those that make up the map.) A single offset vector points to
+ * the common start position for the columns. A common index points
+ * to the common next write position.</li>
+ * <li>Lists also introduce a repeat level. (Details to be worked
+ * out.)</li>
+ * </ul>
+ *
+ * For repeated vectors, one can think of the structure either top-down
+ * or bottom-up:
+ * <ul>
+ * <li>Top down: the row position points into an offset vector. The
+ * offset vector value points to either the data value, or into another
+ * offset vector.</li>
+ * <li>Bottom-up: values are appended to the end of the vector. Values
+ * are "pinched off" to form an array (for repeated maps) or for a row.
+ * In this view, indexes bubble upward. The inner-most last write position
+ * is written as the array end position in the enclosing offset vector.
+ * This may repeat upward through several levels.</li>
+ * </ul>
+ *
+ * <h4>Writer Data Model</h4>
+ *
+ * The above leads to a very simple, JSON-like data model:
+ * <ul>
+ * <li>A tuple reader or writer models a row. (Usually via a subclass.) Columns
+ * are accessible by name or position.</li>
+ * <li>Every column is modeled as an object.</li>
+ * <li>The object can have an object type: scalar, tuple or array.</li>
+ * <li>An array has a single element type (but many run-time elements)</li>
+ * <li>A scalar can be nullable or not, and provides a uniform get/set
+ * interface.</li>
+ * </ul>
+ * <p>
+ * This data model is similar to, but has important differences from, the prior,
+ * generated, readers and writers.
+ * <p>
+ * The object layer is new: it is the simplest way to model the three “object
+ * types.” An app using this code would use just the leaf scalar readers and
+ * writers.
+ *
+ * <h4>Writer Performance</h4>
+ *
+ * To maximize performance, have a single version for all "data modes":
+ * (nullable, required, repeated). Some items of note:
+ * <ul>
+ * <li>The writers bypass DrillBuf and the UDLE to perform the needed writes to
+ * direct memory.</li>
+ * <li>The writers cache the buffer address and implement a number of methods
+ * to synchronize that address when the buffer changes (on a new batch or during
+ * vector resize).</li>
+ * <li>Writing requires a single bounds check. In most cases, the write is within
+ * bounds so the single check is all that is needed.</li>
+ * <li>If the write is out of bounds, then the writer determines the new vector
+ * size and performs the needed reallocation. To avoid multiple doublings, the
+ * writer computes the needed new size and allocates that size directly.</li>
+ * <li>Vector reallocation is improved to eliminate zeroing the new half of the
+ * buffer; the data is left “garbage-filled.”</li>
+ * <li>If the vector would grow beyond 16 MB, then overflow is triggered, via a
+ * listener, which causes the buffer to be replaced. The write then
+ * continues.</li>
+ * <li>Offset vector updates are integrated into the writers using an
+ * {@code OffsetVectorWriter}. This writer caches the last write position so that
+ * each array write needs a single offset update, rather than the read and write
+ * as in previous code.</li>
+ * <li>The writers keep track of the “last write position” and perform
+ * “fill-empties” work if the new write position is more than one position
+ * beyond the last write. All types now correctly support “fill-empties”
+ * (before, only nullable types did so reliably.)</li>
+ * <li>Null handling is done by an additional writer layer that wraps the
+ * underlying data writer. This avoids the need for a special nullable writer:
+ * the same nullable layer works for all data types.</li>
+ * <li>Array handling is done similarly: an array writer manages the offset
+ * vector and works the same for repeated scalars, repeated maps and
+ * (eventually) lists and repeated lists.</li>
+ * </ul>
+ */
+
+package org.apache.drill.exec.vector.accessor.writer;

http://git-wip-us.apache.org/repos/asf/drill/blob/40de8ca4/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/AbstractMapVector.java
----------------------------------------------------------------------
diff --git 
a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/AbstractMapVector.java
 
b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/AbstractMapVector.java
index 6b60471..5ac28c5 100644
--- 
a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/AbstractMapVector.java
+++ 
b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/AbstractMapVector.java
@@ -64,7 +64,6 @@ public abstract class AbstractMapVector extends 
AbstractContainerVector {
       valueVector.close();
     }
     vectors.clear();
-
     super.close();
   }
 
@@ -178,7 +177,7 @@ public abstract class AbstractMapVector extends 
AbstractContainerVector {
    *
    * Note that this method does not enforce any vector type check nor throws a 
schema change exception.
    */
-  protected void putChild(String name, ValueVector vector) {
+  public void putChild(String name, ValueVector vector) {
     putVector(name, vector);
     field.addChild(vector.getField());
   }
@@ -280,6 +279,16 @@ public abstract class AbstractMapVector extends 
AbstractContainerVector {
   }
 
   @Override
+  public int getAllocatedSize() {
+    int size = 0;
+
+    for (final ValueVector v : vectors.values()) {
+      size += v.getAllocatedSize();
+    }
+    return size;
+  }
+
+  @Override
   public void collectLedgers(Set<BufferLedger> ledgers) {
     for (final ValueVector v : vectors.values()) {
       v.collectLedgers(ledgers);

http://git-wip-us.apache.org/repos/asf/drill/blob/40de8ca4/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/BaseRepeatedValueVector.java
----------------------------------------------------------------------
diff --git 
a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/BaseRepeatedValueVector.java
 
b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/BaseRepeatedValueVector.java
index 2b41b8b..8472f80 100644
--- 
a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/BaseRepeatedValueVector.java
+++ 
b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/BaseRepeatedValueVector.java
@@ -87,14 +87,10 @@ public abstract class BaseRepeatedValueVector extends 
BaseValueVector implements
 
 
   @Override
-  public UInt4Vector getOffsetVector() {
-    return offsets;
-  }
+  public UInt4Vector getOffsetVector() { return offsets; }
 
   @Override
-  public ValueVector getDataVector() {
-    return vector;
-  }
+  public ValueVector getDataVector() { return vector; }
 
   @Override
   public void setInitialCapacity(int numRecords) {
@@ -127,6 +123,11 @@ public abstract class BaseRepeatedValueVector extends 
BaseValueVector implements
   }
 
   @Override
+  public int getAllocatedSize() {
+    return offsets.getAllocatedSize() + vector.getAllocatedSize();
+  }
+
+  @Override
   public int getBufferSizeFor(int valueCount) {
     if (valueCount == 0) {
       return 0;

http://git-wip-us.apache.org/repos/asf/drill/blob/40de8ca4/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/ListVector.java
----------------------------------------------------------------------
diff --git 
a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/ListVector.java
 
b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/ListVector.java
index 9569946..7de5ce6 100644
--- 
a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/ListVector.java
+++ 
b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/ListVector.java
@@ -52,7 +52,6 @@ public class ListVector extends BaseRepeatedValueVector {
   private Accessor accessor = new Accessor();
   private UnionListWriter writer;
   private UnionListReader reader;
-  private CallBack callBack;
 
   public ListVector(MaterializedField field, BufferAllocator allocator, 
CallBack callBack) {
     super(field, allocator);
@@ -61,7 +60,6 @@ public class ListVector extends BaseRepeatedValueVector {
     this.field.addChild(getDataVector().getField());
     this.writer = new UnionListWriter(this);
     this.reader = new UnionListReader(this);
-    this.callBack = callBack;
   }
 
   public UnionListWriter getWriter() {
@@ -203,6 +201,8 @@ public class ListVector extends BaseRepeatedValueVector {
             .addChild(bits.getMetadata())
             .addChild(vector.getMetadata());
   }
+
+  @Override
   public <T extends ValueVector> AddOrGetResult<T> 
addOrGetVector(VectorDescriptor descriptor) {
     AddOrGetResult<T> result = super.addOrGetVector(descriptor);
     reader = new UnionListReader(this);

http://git-wip-us.apache.org/repos/asf/drill/blob/40de8ca4/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/MapVector.java
----------------------------------------------------------------------
diff --git 
a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/MapVector.java 
b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/MapVector.java
index 19c910b..4a501b8 100644
--- 
a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/MapVector.java
+++ 
b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/MapVector.java
@@ -67,10 +67,7 @@ public class MapVector extends AbstractMapVector {
   }
 
   @Override
-  public FieldReader getReader() {
-    //return new SingleMapReaderImpl(MapVector.this);
-    return reader;
-  }
+  public FieldReader getReader() { return reader; }
 
   transient private MapTransferPair ephPair;
   transient private MapSingleCopier ephPair2;
@@ -95,9 +92,7 @@ public class MapVector extends AbstractMapVector {
   }
 
   @Override
-  protected boolean supportsDirectRead() {
-    return true;
-  }
+  protected boolean supportsDirectRead() { return true; }
 
   public Iterator<String> fieldNameIterator() {
     return getChildFieldNames().iterator();
@@ -124,6 +119,15 @@ public class MapVector extends AbstractMapVector {
   }
 
   @Override
+  public int getAllocatedSize() {
+    int size = 0;
+    for (final ValueVector v : this) {
+      size += v.getAllocatedSize();
+    }
+    return size;
+  }
+
+  @Override
   public int getBufferSizeFor(final int valueCount) {
     if (valueCount == 0) {
       return 0;
@@ -353,6 +357,10 @@ public class MapVector extends AbstractMapVector {
     return getChildByOrdinal(id);
   }
 
+  public void setMapValueCount(int valueCount) {
+    this.valueCount = valueCount;
+  }
+
   public class Mutator extends BaseValueVector.BaseMutator {
 
     @Override
@@ -360,7 +368,7 @@ public class MapVector extends AbstractMapVector {
       for (final ValueVector v : getChildren()) {
         v.getMutator().setValueCount(valueCount);
       }
-      MapVector.this.valueCount = valueCount;
+      setMapValueCount(valueCount);
     }
 
     @Override

http://git-wip-us.apache.org/repos/asf/drill/blob/40de8ca4/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedListVector.java
----------------------------------------------------------------------
diff --git 
a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedListVector.java
 
b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedListVector.java
index be9ebee..6442417 100644
--- 
a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedListVector.java
+++ 
b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedListVector.java
@@ -198,14 +198,10 @@ public class RepeatedListVector extends 
AbstractContainerVector
     }
 
     @Override
-    public RepeatedListAccessor getAccessor() {
-      return accessor;
-    }
+    public RepeatedListAccessor getAccessor() { return accessor; }
 
     @Override
-    public RepeatedListMutator getMutator() {
-      return mutator;
-    }
+    public RepeatedListMutator getMutator() { return mutator; }
 
     @Override
     public FieldReader getReader() {
@@ -277,11 +273,8 @@ public class RepeatedListVector extends 
AbstractContainerVector
     }
   }
 
-
-    @Override
-  public RepeatedListReaderImpl getReader() {
-    return reader;
-  }
+  @Override
+  public RepeatedListReaderImpl getReader() { return reader; }
 
   @Override
   public DelegateRepeatedVector.RepeatedListAccessor getAccessor() {
@@ -334,6 +327,11 @@ public class RepeatedListVector extends 
AbstractContainerVector
   }
 
   @Override
+  public int getAllocatedSize() {
+    return delegate.getAllocatedSize();
+  }
+
+  @Override
   public int getBufferSizeFor(final int valueCount) {
     return delegate.getBufferSizeFor(valueCount);
   }

http://git-wip-us.apache.org/repos/asf/drill/blob/40de8ca4/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedMapVector.java
----------------------------------------------------------------------
diff --git 
a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedMapVector.java
 
b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedMapVector.java
index 6b29258..57f1a67 100644
--- 
a/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedMapVector.java
+++ 
b/exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedMapVector.java
@@ -63,18 +63,22 @@ public class RepeatedMapVector extends AbstractMapVector
   private final Mutator mutator = new Mutator();
   private final EmptyValuePopulator emptyPopulator;
 
-  public RepeatedMapVector(MaterializedField field, BufferAllocator allocator, 
CallBack callBack){
+  public RepeatedMapVector(MaterializedField field, BufferAllocator allocator, 
CallBack callBack) {
     super(field, allocator, callBack);
     this.offsets = new UInt4Vector(BaseRepeatedValueVector.OFFSETS_FIELD, 
allocator);
     this.emptyPopulator = new EmptyValuePopulator(offsets);
   }
 
-  @Override
-  public UInt4Vector getOffsetVector() {
-    return offsets;
+  public RepeatedMapVector(MaterializedField field, UInt4Vector offsets, 
CallBack callBack) {
+    super(field, offsets.getAllocator(), callBack);
+    this.offsets = offsets;
+    this.emptyPopulator = new EmptyValuePopulator(offsets);
   }
 
   @Override
+  public UInt4Vector getOffsetVector() { return offsets; }
+
+  @Override
   public ValueVector getDataVector() {
     throw new UnsupportedOperationException();
   }
@@ -93,9 +97,7 @@ public class RepeatedMapVector extends AbstractMapVector
   }
 
   @Override
-  public RepeatedMapReaderImpl getReader() {
-    return reader;
-  }
+  public RepeatedMapReaderImpl getReader() { return reader; }
 
   @Override
   public void allocateNew(int groupCount, int innerValueCount) {
@@ -137,6 +139,11 @@ public class RepeatedMapVector extends AbstractMapVector
   }
 
   @Override
+  public int getAllocatedSize() {
+    return offsets.getAllocatedSize() + super.getAllocatedSize();
+  }
+
+  @Override
   public int getBufferSizeFor(final int valueCount) {
     if (valueCount == 0) {
       return 0;

Reply via email to