Github user paul-rogers commented on a diff in the pull request:
https://github.com/apache/drill/pull/1091#discussion_r162225832
--- Diff:
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/flatten/FlattenRecordBatch.java
---
@@ -94,8 +98,54 @@ private void clear() {
}
}
+ private class FlattenMemoryManager {
+ private final int outputRowCount;
+ private static final int OFFSET_VECTOR_WIDTH = 4;
+ private static final int WORST_CASE_FRAGMENTATION_FACTOR = 2;
+ private static final int MAX_NUM_ROWS = 64 * 1024;
+ private static final int MIN_NUM_ROWS = 1;
+
+ private FlattenMemoryManager(RecordBatch incoming, long
outputBatchSize, SchemaPath flattenColumn) {
+ // Get sizing information for the batch.
+ RecordBatchSizer sizer = new RecordBatchSizer(incoming);
+
+ final TypedFieldId typedFieldId =
incoming.getValueVectorId(flattenColumn);
+ final MaterializedField field =
incoming.getSchema().getColumn(typedFieldId.getFieldIds()[0]);
+
+ // Get column size of flatten column.
+ RecordBatchSizer.ColumnSize columnSize =
sizer.getColumn(incoming.getValueAccessorById(field.getValueClass(),
+ typedFieldId.getFieldIds()).getValueVector(), field.getName());
+
+ // Average rowWidth of flatten column
+ final int avgRowWidthFlattenColumn =
RecordBatchSizer.safeDivide(columnSize.dataSize, incoming.getRecordCount());
+
+ // Average rowWidth excluding the flatten column.
+ final int avgRowWidthWithOutFlattenColumn = sizer.netRowWidth() -
avgRowWidthFlattenColumn;
+
+ // Average rowWidth of single element in the flatten list.
+ // subtract the offset vector size from column data size.
+ final int avgRowWidthSingleFlattenEntry =
+ RecordBatchSizer.safeDivide(columnSize.dataSize -
(OFFSET_VECTOR_WIDTH * columnSize.valueCount), columnSize.elementCount);
+
+ // Average rowWidth of outgoing batch.
+ final int avgOutgoingRowWidth = avgRowWidthWithOutFlattenColumn +
avgRowWidthSingleFlattenEntry;
+
+ // Number of rows in outgoing batch
+ outputRowCount = Math.max(MIN_NUM_ROWS, Math.min(MAX_NUM_ROWS,
+
RecordBatchSizer.safeDivide((outputBatchSize/WORST_CASE_FRAGMENTATION_FACTOR),
avgOutgoingRowWidth)));
+ }
--- End diff --
Would be great to log this info in debug mode to aid in tuning. Doing this
in the sort proved highly valuable.
---