Github user Ben-Zvi commented on a diff in the pull request:

    https://github.com/apache/drill/pull/860#discussion_r123838906
  
    --- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/spill/RecordBatchSizer.java
 ---
    @@ -19,117 +19,162 @@
     
     import java.util.ArrayList;
     import java.util.List;
    +import java.util.Set;
     
    +import org.apache.drill.common.types.TypeProtos.DataMode;
     import org.apache.drill.common.types.TypeProtos.MinorType;
     import org.apache.drill.exec.expr.TypeHelper;
    +import org.apache.drill.exec.memory.AllocationManager.BufferLedger;
     import org.apache.drill.exec.memory.BaseAllocator;
     import org.apache.drill.exec.record.BatchSchema;
     import org.apache.drill.exec.record.MaterializedField;
     import org.apache.drill.exec.record.RecordBatch;
    +import org.apache.drill.exec.record.SmartAllocationHelper;
     import org.apache.drill.exec.record.VectorAccessible;
     import org.apache.drill.exec.record.VectorWrapper;
     import org.apache.drill.exec.record.selection.SelectionVector2;
    +import org.apache.drill.exec.vector.UInt4Vector;
     import org.apache.drill.exec.vector.ValueVector;
     import org.apache.drill.exec.vector.complex.AbstractMapVector;
    +import org.apache.drill.exec.vector.complex.RepeatedValueVector;
     import org.apache.drill.exec.vector.VariableWidthVector;
     
    +import com.google.common.collect.Sets;
    +
     /**
      * Given a record batch or vector container, determines the actual memory
      * consumed by each column, the average row, and the entire record batch.
      */
     
     public class RecordBatchSizer {
    -//  private static final org.slf4j.Logger logger = 
org.slf4j.LoggerFactory.getLogger(RecordBatchSizer.class);
     
       /**
        * Column size information.
        */
       public static class ColumnSize {
    +    public final String prefix;
         public final MaterializedField metadata;
     
         /**
    -     * Assumed size from Drill metadata.
    +     * Assumed size from Drill metadata. Note that this information is
    +     * 100% bogus. Do not use it.
          */
     
    +    @Deprecated
         public int stdSize;
     
         /**
    -     * Actual memory consumed by all the vectors associated with this 
column.
    -     */
    -
    -    public int totalSize;
    -
    -    /**
          * Actual average column width as determined from actual memory use. 
This
          * size is larger than the actual data size since this size includes 
per-
          * column overhead such as any unused vector space, etc.
          */
     
    -    public int estSize;
    -    public int capacity;
    -    public int density;
    -    public int dataSize;
    -    public boolean variableWidth;
    -
    -    public ColumnSize(ValueVector vv) {
    -      metadata = vv.getField();
    -      stdSize = TypeHelper.getSize(metadata.getType());
    -
    -      // Can't get size estimates if this is an empty batch.
    -      int rowCount = vv.getAccessor().getValueCount();
    -      if (rowCount == 0) {
    -        estSize = stdSize;
    -        return;
    -      }
    +    public final int estSize;
    +    public final int valueCount;
    +    public final int entryCount;
    +    public final int dataSize;
    +    public final int estElementCount;
    +    public final boolean isVariableWidth;
     
    -      // Total size taken by all vectors (and underlying buffers)
    -      // associated with this vector.
    +    public ColumnSize(ValueVector v, String prefix, int valueCount) {
    +      this.prefix = prefix;
    +      this.valueCount = valueCount;
    +      metadata = v.getField();
    +      boolean isMap = v.getField().getType().getMinorType() == 
MinorType.MAP;
     
    -      totalSize = vv.getAllocatedByteCount();
    +      // The amount of memory consumed by the payload: the actual
    +      // data stored in the vectors.
     
    -      // Capacity is the number of values that the vector could
    -      // contain. This is useful only for fixed-length vectors.
    +      int mapSize = 0;
    +      if (v.getField().getDataMode() == DataMode.REPEATED) {
     
    -      capacity = vv.getValueCapacity();
    +        // Repeated vectors are special: they have an associated offset 
vector
    +        // that changes the value count of the contained vectors.
     
    -      // The amount of memory consumed by the payload: the actual
    -      // data stored in the vectors.
    +        @SuppressWarnings("resource")
    +        UInt4Vector offsetVector = ((RepeatedValueVector) 
v).getOffsetVector();
    +        entryCount = offsetVector.getAccessor().get(valueCount);
    +        estElementCount = roundUp(entryCount, valueCount);
    +        if (isMap) {
     
    -      dataSize = vv.getPayloadByteCount();
    +          // For map, the only data associated with the map vector
    +          // itself is the offset vector, if any.
     
    -      // Determine "density" the number of rows compared to potential
    -      // capacity. Low-density batches occur at block boundaries, ends
    -      // of files and so on. Low-density batches throw off our estimates
    -      // for Varchar columns because we don't know the actual number of
    -      // bytes consumed (that information is hidden behind the Varchar
    -      // implementation where we can't get at it.)
    +          mapSize = offsetVector.getPayloadByteCount(valueCount);
    +        }
    +      } else {
    +        entryCount = valueCount;
    +        estElementCount = 1;
    +      }
     
    -      density = roundUp(dataSize * 100, totalSize);
    -      estSize = roundUp(dataSize, rowCount);
    -      variableWidth = vv instanceof VariableWidthVector ;
    +      if (isMap) {
    +        dataSize = mapSize;
    +        stdSize = 0;
    +     } else {
    +        dataSize = v.getPayloadByteCount(valueCount);
    +        stdSize = TypeHelper.getSize(metadata.getType());
    +     }
    +      estSize = roundUp(dataSize, valueCount);
    +      isVariableWidth = v instanceof VariableWidthVector ;
         }
     
         @Override
         public String toString() {
           StringBuilder buf = new StringBuilder()
    +          .append(prefix)
               .append(metadata.getName())
               .append("(type: ")
    +          .append(metadata.getType().getMode().name())
    +          .append(" ")
               .append(metadata.getType().getMinorType().name())
    -          .append(", std col. size: ")
    +          .append(", count: ")
    +          .append(valueCount);
    +      if (metadata.getDataMode() == DataMode.REPEATED) {
    +        buf.append(", total entries: ")
    +           .append(entryCount)
    +           .append(", per-array: ")
    +           .append(estElementCount);
    +      }
    +      buf .append(", std size: ")
               .append(stdSize)
    -          .append(", actual col. size: ")
    +          .append(", actual size: ")
               .append(estSize)
    -          .append(", total size: ")
    -          .append(totalSize)
               .append(", data size: ")
               .append(dataSize)
    -          .append(", row capacity: ")
    -          .append(capacity)
    -          .append(", density: ")
    -          .append(density)
               .append(")");
           return buf.toString();
         }
    +
    +    public void buildAllocHelper(SmartAllocationHelper allocHelper) {
    +      int width = 0;
    +      switch(metadata.getType().getMinorType()) {
    +      case VAR16CHAR:
    +      case VARBINARY:
    +      case VARCHAR:
    +
    +        // Subtract out the offset vector width
    +        width = estSize - 4;
    +
    +        // Subtract out the bits (is-set) vector width
    +        if (metadata.getDataMode() == DataMode.OPTIONAL) {
    +          width -= 1;
    +        }
    +        break;
    +      default:
    +        break;
    +      }
    +      String name = prefix + metadata.getName();
    +      if (metadata.getDataMode() == DataMode.REPEATED) {
    +        if (width > 0) {
    +          allocHelper.variableWidthArray(name, width, estElementCount);
    --- End diff --
    
    (1) The methods variableWidthArray() and variableWidth() can be combined 
into one (as the latter is called when estElementCount is 1, which makes the 
two do the same).
    (2) Can move this call (combined with the one below - on line 175) into the 
new if statement above that handles variable subtypes.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

Reply via email to