liupc commented on a change in pull request #27968:
URL: https://github.com/apache/spark/pull/27968#discussion_r424930338



##########
File path: core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
##########
@@ -290,6 +306,88 @@ object SizeEstimator extends Logging {
     size
   }
 
+
+  /** Visit AppendOnlyMap data field which stored all the KVs, we handle this 
field separately
+   *  because the underlying type of the elems of this array is different, and 
their size may vary
+   *  significantly, for example, the value may be an array-like buffer to 
store merged or grouped
+   *  values for aggregation.
+   * */
+  private def visitKVDataArray(
+      data: Array[AnyRef],
+      keyPositions: java.util.BitSet,
+      totalValueElements: Int,
+      state: SearchState): Unit = {
+    val length = data.length
+    var arrSize: Long = alignSize(objectSize + INT_SIZE)
+    state.size += arrSize
+    state.size += alignSize((length - keyPositions.size) * pointerSize)
+
+    if (length <= ARRAY_SIZE_FOR_SAMPLING) {
+      for (e <- data) {
+        state.enqueue(e)
+      }
+    } else {
+      val rand = new Random(42)
+      val drawn = new OpenHashSet[Int](2 * ARRAY_SAMPLE_SIZE)
+      val (numKeys1, keySize1, numValueElements1, valueSize1) =
+        sampleKVDataArray(data, keyPositions, state, rand, drawn, length)
+      val (numKeys2, keySize2, numValueElements2, valueSize2) =
+        sampleKVDataArray(data, keyPositions, state, rand, drawn, length)
+      val (_, keySizeForMax, numKeysForMin, keySizeForMin) = if (keySize1 > 
keySize2) {
+        (numKeys1, keySize1, numKeys2, keySize2)
+      } else (numKeys2, keySize2, numKeys1, keySize1)
+      val keySize = keySizeForMax + (keySizeForMin *
+          ((keyPositions.cardinality() - numKeysForMin).toDouble / 
numKeysForMin)).toLong
+      state.size += keySize

Review comment:
       No, the key should usually be a non-complex type, but it can be a 
variable-length type such as a string.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to