GitHub user paul-rogers commented on a diff in the pull request: https://github.com/apache/drill/pull/860#discussion_r128129255 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/xsort/managed/PriorityQueueCopierWrapper.java --- @@ -245,29 +250,35 @@ private BatchMerger(PriorityQueueCopierWrapper holder, BatchSchema schema, List< @Override public boolean next() { - Stopwatch w = Stopwatch.createStarted(); long start = holder.getAllocator().getAllocatedMemory(); + + // Allocate an outgoing container the "dumb" way (based on static sizes) + // for testing, or the "smart" way (based on actual observed data sizes) + // for production code. + + if (allocHelper == null) { + VectorAccessibleUtilities.allocateVectors(outputContainer, targetRecordCount); + } else { + allocHelper.allocateBatch(outputContainer, targetRecordCount); + } + logger.trace("Initial output batch allocation: {} bytes", + holder.getAllocator().getAllocatedMemory() - start); + Stopwatch w = Stopwatch.createStarted(); int count = holder.copier.next(targetRecordCount); - copyCount += count; if (count > 0) { long t = w.elapsed(TimeUnit.MICROSECONDS); batchCount++; - logger.trace("Took {} us to merge {} records", t, count); long size = holder.getAllocator().getAllocatedMemory() - start; + logger.trace("Took {} us to merge {} records, consuming {} bytes of memory", + t, count, size); estBatchSize = Math.max(estBatchSize, size); } else { logger.trace("copier returned 0 records"); } - // Identify the schema to be used in the output container. (Since - // all merged batches have the same schema, the schema we identify - // here should be the same as that which we already had. + // Initialize output container metadata. --- End diff -- The removed comments were actually a bit off the mark and reflected a partial understanding. The very nature of buildSchema() is just to copy the schema from the vectors into the schema for the batch; it has nothing (directly) to do with the schema of incoming batches.
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and would like it, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. ---