[
https://issues.apache.org/jira/browse/DRILL-5601?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16093825#comment-16093825
]
ASF GitHub Bot commented on DRILL-5601:
---------------------------------------
Github user paul-rogers commented on a diff in the pull request:
https://github.com/apache/drill/pull/860#discussion_r128365255
--- Diff:
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/spill/RecordBatchSizer.java
---
@@ -19,117 +19,162 @@
import java.util.ArrayList;
import java.util.List;
+import java.util.Set;
+import org.apache.drill.common.types.TypeProtos.DataMode;
import org.apache.drill.common.types.TypeProtos.MinorType;
import org.apache.drill.exec.expr.TypeHelper;
+import org.apache.drill.exec.memory.AllocationManager.BufferLedger;
import org.apache.drill.exec.memory.BaseAllocator;
import org.apache.drill.exec.record.BatchSchema;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.record.RecordBatch;
+import org.apache.drill.exec.record.SmartAllocationHelper;
import org.apache.drill.exec.record.VectorAccessible;
import org.apache.drill.exec.record.VectorWrapper;
import org.apache.drill.exec.record.selection.SelectionVector2;
+import org.apache.drill.exec.vector.UInt4Vector;
import org.apache.drill.exec.vector.ValueVector;
import org.apache.drill.exec.vector.complex.AbstractMapVector;
+import org.apache.drill.exec.vector.complex.RepeatedValueVector;
import org.apache.drill.exec.vector.VariableWidthVector;
+import com.google.common.collect.Sets;
+
/**
* Given a record batch or vector container, determines the actual memory
* consumed by each column, the average row, and the entire record batch.
*/
public class RecordBatchSizer {
-// private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RecordBatchSizer.class);
/**
* Column size information.
*/
public static class ColumnSize {
+ public final String prefix;
public final MaterializedField metadata;
/**
- * Assumed size from Drill metadata.
+ * Assumed size from Drill metadata. Note that this information is
+ * 100% bogus. Do not use it.
*/
+ @Deprecated
public int stdSize;
/**
- * Actual memory consumed by all the vectors associated with this
column.
- */
-
- public int totalSize;
-
- /**
* Actual average column width as determined from actual memory use.
This
* size is larger than the actual data size since this size includes
per-
* column overhead such as any unused vector space, etc.
*/
- public int estSize;
- public int capacity;
- public int density;
- public int dataSize;
- public boolean variableWidth;
-
- public ColumnSize(ValueVector vv) {
- metadata = vv.getField();
- stdSize = TypeHelper.getSize(metadata.getType());
-
- // Can't get size estimates if this is an empty batch.
- int rowCount = vv.getAccessor().getValueCount();
- if (rowCount == 0) {
- estSize = stdSize;
- return;
- }
+ public final int estSize;
+ public final int valueCount;
+ public final int entryCount;
+ public final int dataSize;
+ public final int estElementCount;
+ public final boolean isVariableWidth;
- // Total size taken by all vectors (and underlying buffers)
- // associated with this vector.
+ public ColumnSize(ValueVector v, String prefix, int valueCount) {
+ this.prefix = prefix;
+ this.valueCount = valueCount;
+ metadata = v.getField();
+ boolean isMap = v.getField().getType().getMinorType() == MinorType.MAP;
- totalSize = vv.getAllocatedByteCount();
+ // The amount of memory consumed by the payload: the actual
+ // data stored in the vectors.
- // Capacity is the number of values that the vector could
- // contain. This is useful only for fixed-length vectors.
+ int mapSize = 0;
+ if (v.getField().getDataMode() == DataMode.REPEATED) {
- capacity = vv.getValueCapacity();
+ // Repeated vectors are special: they have an associated offset vector
+ // that changes the value count of the contained vectors.
- // The amount of memory consumed by the payload: the actual
- // data stored in the vectors.
+ @SuppressWarnings("resource")
+ UInt4Vector offsetVector = ((RepeatedValueVector) v).getOffsetVector();
+ entryCount = offsetVector.getAccessor().get(valueCount);
+ estElementCount = roundUp(entryCount, valueCount);
+ if (isMap) {
- dataSize = vv.getPayloadByteCount();
+ // For map, the only data associated with the map vector
+ // itself is the offset vector, if any.
- // Determine "density" the number of rows compared to potential
- // capacity. Low-density batches occur at block boundaries, ends
- // of files and so on. Low-density batches throw off our estimates
- // for Varchar columns because we don't know the actual number of
- // bytes consumed (that information is hidden behind the Varchar
- // implementation where we can't get at it.)
+ mapSize = offsetVector.getPayloadByteCount(valueCount);
+ }
+ } else {
+ entryCount = valueCount;
+ estElementCount = 1;
+ }
- density = roundUp(dataSize * 100, totalSize);
- estSize = roundUp(dataSize, rowCount);
- variableWidth = vv instanceof VariableWidthVector ;
+ if (isMap) {
+ dataSize = mapSize;
+ stdSize = 0;
+ } else {
+ dataSize = v.getPayloadByteCount(valueCount);
+ stdSize = TypeHelper.getSize(metadata.getType());
+ }
+ estSize = roundUp(dataSize, valueCount);
+ isVariableWidth = v instanceof VariableWidthVector ;
}
@Override
public String toString() {
StringBuilder buf = new StringBuilder()
+ .append(prefix)
.append(metadata.getName())
.append("(type: ")
+ .append(metadata.getType().getMode().name())
+ .append(" ")
.append(metadata.getType().getMinorType().name())
- .append(", std col. size: ")
+ .append(", count: ")
+ .append(valueCount);
+ if (metadata.getDataMode() == DataMode.REPEATED) {
+ buf.append(", total entries: ")
+ .append(entryCount)
+ .append(", per-array: ")
+ .append(estElementCount);
+ }
+ buf .append(", std size: ")
.append(stdSize)
- .append(", actual col. size: ")
+ .append(", actual size: ")
.append(estSize)
- .append(", total size: ")
- .append(totalSize)
.append(", data size: ")
.append(dataSize)
- .append(", row capacity: ")
- .append(capacity)
- .append(", density: ")
- .append(density)
.append(")");
return buf.toString();
}
+
+ public void buildAllocHelper(SmartAllocationHelper allocHelper) {
+ int width = 0;
+ switch(metadata.getType().getMinorType()) {
+ case VAR16CHAR:
+ case VARBINARY:
+ case VARCHAR:
+
+ // Subtract out the offset vector width
+ width = estSize - 4;
+
+ // Subtract out the bits (is-set) vector width
+ if (metadata.getDataMode() == DataMode.OPTIONAL) {
+ width -= 1;
+ }
+ break;
+ default:
+ break;
+ }
+ String name = prefix + metadata.getName();
+ if (metadata.getDataMode() == DataMode.REPEATED) {
+ if (width > 0) {
+ allocHelper.variableWidthArray(name, width, estElementCount);
--- End diff --
This was an early draft; since revised.
> Rollup of External Sort memory management fixes
> -----------------------------------------------
>
> Key: DRILL-5601
> URL: https://issues.apache.org/jira/browse/DRILL-5601
> Project: Apache Drill
> Issue Type: Task
> Affects Versions: 1.11.0
> Reporter: Paul Rogers
> Assignee: Paul Rogers
> Fix For: 1.12.0
>
>
> Rollup of a set of specific JIRA entries that all relate to the very
> difficult problem of managing memory within Drill in order for the external
> sort to stay within a memory budget. In general, the fixes relate to better
> estimating memory used by the three ways that Drill allocates vector memory
> (see DRILL-5522) and to predicting the size of vectors that the sort will
> create, to avoid repeated realloc-copy cycles (see DRILL-5594).
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)