Github user paul-rogers commented on a diff in the pull request: https://github.com/apache/drill/pull/1101#discussion_r165566478 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/aggregate/HashAggTemplate.java --- @@ -516,43 +501,48 @@ private void initializeSetup(RecordBatch newIncoming) throws SchemaChangeExcepti * @param incoming */ private void updateEstMaxBatchSize(RecordBatch incoming) { - if ( estMaxBatchSize > 0 ) { return; } // no handling of a schema (or varchar) change // Use the sizer to get the input row width and the length of the longest varchar column - RecordBatchSizer sizer = new RecordBatchSizer(incoming); - logger.trace("Incoming sizer: {}",sizer); - // An empty batch only has the schema, can not tell actual length of varchars - // else use the actual varchars length, each capped at 50 (to match the space allocation) - long estInputRowWidth = sizer.rowCount() == 0 ? sizer.stdRowWidth() : sizer.netRowWidthCap50(); + final RecordBatchSizer incomingColumnSizes = new RecordBatchSizer(incoming); + final Map<String, RecordBatchSizer.ColumnSize> columnSizeMap = incomingColumnSizes.getColumnSizeMap(); + keySizes = CaseInsensitiveMap.newHashMap(); - // Get approx max (varchar) column width to get better memory allocation - maxColumnWidth = Math.max(sizer.maxAvgColumnSize(), VARIABLE_MIN_WIDTH_VALUE_SIZE); - maxColumnWidth = Math.min(maxColumnWidth, VARIABLE_MAX_WIDTH_VALUE_SIZE); + logger.trace("Incoming sizer: {}",incomingColumnSizes); - // // Calculate the estimated max (internal) batch (i.e. Keys batch + Values batch) size // (which is used to decide when to spill) // Also calculate the values batch size (used as a reserve to overcome an OOM) - // - Iterator<VectorWrapper<?>> outgoingIter = outContainer.iterator(); - int fieldId = 0; - while (outgoingIter.hasNext()) { - ValueVector vv = outgoingIter.next().getValueVector(); - MaterializedField mr = vv.getField(); - int fieldSize = vv instanceof VariableWidthVector ? 
maxColumnWidth : - TypeHelper.getSize(mr.getType()); - estRowWidth += fieldSize; - estOutputRowWidth += fieldSize; - if ( fieldId < numGroupByOutFields ) { fieldId++; } - else { estValuesRowWidth += fieldSize; } + + for (int columnIndex = 0; columnIndex < numGroupByOutFields; columnIndex++) { + final VectorWrapper vectorWrapper = outContainer.getValueVector(columnIndex); + final String columnName = vectorWrapper.getField().getName(); + final int columnSize = columnSizeMap.get(columnName).estSize; + keySizes.put(columnName, columnSize); + estOutputRowWidth += columnSize; } + + long estValuesRowWidth = 0; + + for (int columnIndex = numGroupByOutFields; columnIndex < outContainer.getNumberOfColumns(); columnIndex++) { + VectorWrapper vectorWrapper = outContainer.getValueVector(columnIndex); + RecordBatchSizer.ColumnSize columnSize = new RecordBatchSizer.ColumnSize(vectorWrapper.getValueVector()); + estOutputRowWidth += columnSize.estSize; --- End diff -- The columns in the output batch for the hash agg are usually required numerics (fixed width). But, they can be nullable in some cases (talk to Boaz.) If you are doing a max or min on a string, then the output will be a VarChar, so you have to account for the offset and data vectors. If you want to know how much memory a vector takes after creating it, just measure memory before and after the vector allocation. If you want to predict the size to do an allocation, then for fixed width you just need a row count. For VarChar, you need to predict the width also. You only need the "sizer" when trying to predict a vector size before you create it, and the vector has variable-width or complex parts.
---