kgyrtkirk commented on a change in pull request #1250: URL: https://github.com/apache/hive/pull/1250#discussion_r454349711
########## File path: ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java ########## @@ -358,6 +360,34 @@ public void close(boolean aborted) throws HiveException { */ private long numRowsCompareHashAggr; + /** + * To track current memory usage. + */ + private long currMemUsed; + + /** + * Whether to make use of LRUCache for map aggr buffers or not. + */ + private boolean lruCache; + + class LRUCache extends LinkedHashMap<KeyWrapper, VectorAggregationBufferRow> { + + @Override + protected boolean removeEldestEntry(Map.Entry<KeyWrapper, VectorAggregationBufferRow> eldest) { + if (currMemUsed > maxHashTblMemory || size() > maxHtEntries || gcCanary.get() == null) { Review comment: this method seems to have been polluted by the "isFull" logic, which is unexpected given the method name. The "isFull" check should be moved outside, and remove should only be called when the condition is met. ########## File path: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java ########## @@ -4065,6 +4065,9 @@ private static void populateLlapDaemonVarsSet(Set<String> llapDaemonVarsSetLocal HIVE_VECTORIZATION_GROUPBY_MAXENTRIES("hive.vectorized.groupby.maxentries", 1000000, "Max number of entries in the vector group by aggregation hashtables. 
\n" + "Exceeding this will trigger a flush irrelevant of memory pressure condition."), + HIVE_VECTORIZATION_GROUPBY_ENABLE_LRU_FOR_AGGR( Review comment: instead of introducing a boolean toggle; add a mode switch (default/lru/etc) ########## File path: ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java ########## @@ -420,35 +460,56 @@ public void doProcessBatch(VectorizedRowBatch batch, boolean isFirstGroupingSet, //Flush if memory limits were reached // We keep flushing until the memory is under threshold int preFlushEntriesCount = numEntriesHashTable; - while (shouldFlush(batch)) { - flush(false); - if(gcCanary.get() == null) { - gcCanaryFlushes++; - gcCanary = new SoftReference<Object>(new Object()); - } + if (!lruCache) { + while (shouldFlush(batch)) { + flush(false); + + if(gcCanary.get() == null) { + gcCanaryFlushes++; + gcCanary = new SoftReference<Object>(new Object()); + } - //Validate that some progress is being made - if (!(numEntriesHashTable < preFlushEntriesCount)) { - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("Flush did not progress: %d entries before, %d entries after", - preFlushEntriesCount, - numEntriesHashTable)); + //Validate that some progress is being made + if (!(numEntriesHashTable < preFlushEntriesCount)) { + if (LOG.isDebugEnabled()) { + LOG.debug(String.format("Flush did not progress: %d entries before, %d entries after", + preFlushEntriesCount, + numEntriesHashTable)); + } + break; } - break; + preFlushEntriesCount = numEntriesHashTable; } - preFlushEntriesCount = numEntriesHashTable; + } else { + checkAndFlushLRU(batch); } if (sumBatchSize == 0 && 0 != batch.size) { // Sample the first batch processed for variable sizes. 
updateAvgVariableSize(batch); + currMemUsed = numEntriesHashTable * (fixedHashEntrySize + avgVariableSize); Review comment: this is strange... there is a `currMemUsed` field and there is also a `currMemUsed` local variable in `shouldFlush` - they might cause things to be more interesting :) ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For additional commands, e-mail: gitbox-h...@hive.apache.org