This is an automated email from the ASF dual-hosted git repository.
kgyrtkirk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new c4f6eb2f914 HIVE-26184: COLLECT_SET with GROUP BY is very slow when
some keys are highly skewed (#3253) (okumin reviewed by Zoltan Haindrich)
c4f6eb2f914 is described below
commit c4f6eb2f91478152c89070a7455df9a1b8980c75
Author: okumin <[email protected]>
AuthorDate: Tue Jun 14 00:26:36 2022 +0900
HIVE-26184: COLLECT_SET with GROUP BY is very slow when some keys are
highly skewed (#3253) (okumin reviewed by Zoltan Haindrich)
---
.../udf/generic/GenericUDAFMkCollectionEvaluator.java | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java
index b05023dd37a..cffc7f76510 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java
@@ -95,11 +95,27 @@ public class GenericUDAFMkCollectionEvaluator extends GenericUDAFEvaluator
throw new RuntimeException("Buffer type unknown");
}
}
+
+ private void reset() {
+ if (bufferType == BufferType.LIST) {
+ container.clear();
+ } else if (bufferType == BufferType.SET) {
+ // Don't reuse a container because HashSet#clear can be very slow. The operation takes O(N)
+ // and N is the capacity of the internal hash table. The internal capacity grows based on
+ // the number of elements and it never shrinks. Thus, HashSet#clear takes O(N) every time
+ // once a skewed key appears.
+ // In order to avoid too many resizing in average cases, we set the initial capacity to the
+ // number of elements of the previous aggregation.
+ container = new LinkedHashSet<>(container.size());
+ } else {
+ throw new RuntimeException("Buffer type unknown");
+ }
+ }
}
@Override
public void reset(AggregationBuffer agg) throws HiveException {
- ((MkArrayAggregationBuffer) agg).container.clear();
+ ((MkArrayAggregationBuffer) agg).reset();
}
@Override