Github user a-roberts commented on a diff in the pull request:
https://github.com/apache/spark/pull/16196#discussion_r91305043
--- Diff: core/src/main/scala/org/apache/spark/util/SizeEstimator.scala ---
@@ -243,47 +253,59 @@ object SizeEstimator extends Logging {
arrSize += alignSize(length.toLong * primitiveSize(elementClass))
state.size += arrSize
} else {
+ // We know that the array we are dealing with is an array of
references
+ // so explicitly expose this type so we can directly manipulate the
array
+ // without help from the Scala runtime for efficiency
arrSize += alignSize(length.toLong * pointerSize)
state.size += arrSize
+ val objArray = array.asInstanceOf[Array[AnyRef]]
+
if (length <= ARRAY_SIZE_FOR_SAMPLING) {
var arrayIndex = 0
while (arrayIndex < length) {
- state.enqueue(ScalaRunTime.array_apply(array,
arrayIndex).asInstanceOf[AnyRef])
+ state.enqueue(objArray(arrayIndex))
arrayIndex += 1
}
} else {
// Estimate the size of a large array by sampling elements without
replacement.
// To exclude the shared objects that the array elements may link,
sample twice
- // and use the min one to calculate array size.
- val rand = new Random(42)
+ // and use the min one to calculate array size.
+ // Use ThreadLocalRandom here since the random is only accessed
from 1 thread
+ // and we can save the overhead of the full thread-safe Random
+ val rand = ThreadLocalRandom.current
val drawn = new OpenHashSet[Int](2 * ARRAY_SAMPLE_SIZE)
- val s1 = sampleArray(array, state, rand, drawn, length)
- val s2 = sampleArray(array, state, rand, drawn, length)
+ val s1 = sampleArray(objArray, state, rand, drawn, length)
+ val s2 = sampleArray(objArray, state, rand, drawn, length)
val size = math.min(s1, s2)
+
state.size += math.max(s1, s2) +
(size * ((length - ARRAY_SAMPLE_SIZE) /
(ARRAY_SAMPLE_SIZE))).toLong
}
}
}
private def sampleArray(
- array: AnyRef,
+ array: Array[AnyRef],
state: SearchState,
- rand: Random,
+ rand: ThreadLocalRandom,
drawn: OpenHashSet[Int],
length: Int): Long = {
var size = 0L
- for (i <- 0 until ARRAY_SAMPLE_SIZE) {
+ // avoid the use of an iterator derived from the range syntax here
for performance
+ var count = 0
+ val end = ARRAY_SAMPLE_SIZE
+ while (count <= end) {
--- End diff --
Ah yes, the loop condition should be just `<`, not `<=`; I will fix this in the next commit.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it to be, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]