This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 5412688cd69 [SPARK-39928][CORE] Optimize `Utils.getIteratorSize` for Scala 2.13 refer to `IterableOnceOps.size` 5412688cd69 is described below commit 5412688cd69792d804ba04ef43104e58c208e26c Author: yangjie01 <yangji...@baidu.com> AuthorDate: Wed Aug 3 08:38:14 2022 -0500 [SPARK-39928][CORE] Optimize `Utils.getIteratorSize` for Scala 2.13 refer to `IterableOnceOps.size` ### What changes were proposed in this pull request? `Utils.getIteratorSize` is slightly slower than `Iterator.size` when using Scala 2.13, but it seems that we can't call `Iterator.size` directly, because `Utils.getIteratorSize` method returns `Long` type data, using `Iterator.size` directly can't ensure that overflow will not occur. So this pr adds `if (iterator.knownSize > 0)` conditional branches to optimize this method when using Scala 2.13, this optimization way refers to `IterableOnceOps.size`: https://github.com/scala/scala/blob/cbc012e573346dc685c478eec5fcbb56d22ea884/src/library/scala/collection/IterableOnce.scala#L835-L849 <img width="553" alt="image" src="https://user-images.githubusercontent.com/1475305/182060742-c56ead62-4164-4633-a501-48e2e3151752.png"> This optimization way is only suitable for Scala 2.13 and before using Scala 2.13, `Utils.getIteratorSize` is not slower than `Iterator.size`. ### Why are the changes needed? Optimize `Utils.getIteratorSize` for Scala 2.13. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? - Pass GitHub Actions Closes #37353 from LuciferYang/Optimize-getIteratorSize-213. Authored-by: yangjie01 <yangji...@baidu.com> Signed-off-by: Sean Owen <sro...@gmail.com> --- .../org/apache/spark/util/Iterators.scala | 35 +++++++++++++++++++ .../org/apache/spark/util/Iterators.scala | 40 ++++++++++++++++++++++ .../main/scala/org/apache/spark/util/Utils.scala | 13 ++----- 3 files changed, 77 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala-2.12/org/apache/spark/util/Iterators.scala b/core/src/main/scala-2.12/org/apache/spark/util/Iterators.scala new file mode 100644 index 00000000000..59530b4ba27 --- /dev/null +++ b/core/src/main/scala-2.12/org/apache/spark/util/Iterators.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.util + +object Iterators { + + /** + * Counts the number of elements of an iterator using a while loop rather than calling + * [[scala.collection.Iterator#size]] because it uses a for loop, which is slightly slower + * in the current version of Scala. + */ + def size(iterator: Iterator[_]): Long = { + var count = 0L + while (iterator.hasNext) { + count += 1L + iterator.next() + } + count + } +} diff --git a/core/src/main/scala-2.13/org/apache/spark/util/Iterators.scala b/core/src/main/scala-2.13/org/apache/spark/util/Iterators.scala new file mode 100644 index 00000000000..0ffba8d13a3 --- /dev/null +++ b/core/src/main/scala-2.13/org/apache/spark/util/Iterators.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.util + +object Iterators { + + /** + * Counts the number of elements of an iterator. + * This method is slower than `iterator.size` when using Scala 2.13, + * but it can avoid overflowing problem. + */ + def size(iterator: Iterator[_]): Long = { + // SPARK-39928: For Scala 2.13, add check of `iterator.knownSize` refer to + // `IterableOnceOps#size` to reduce the performance gap with `iterator.size`. + if (iterator.knownSize > 0) iterator.knownSize.toLong + else { + var count = 0L + while (iterator.hasNext) { + count += 1L + iterator.next() + } + count + } + } +} diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 56392f0239c..aef79c7882c 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1908,18 +1908,9 @@ private[spark] object Utils extends Logging { } /** - * Counts the number of elements of an iterator using a while loop rather than calling - * [[scala.collection.Iterator#size]] because it uses a for loop, which is slightly slower - * in the current version of Scala. + * Counts the number of elements of an iterator. */ - def getIteratorSize(iterator: Iterator[_]): Long = { - var count = 0L - while (iterator.hasNext) { - count += 1L - iterator.next() - } - count - } + def getIteratorSize(iterator: Iterator[_]): Long = Iterators.size(iterator) /** * Generate a zipWithIndex iterator, avoid index value overflowing problem --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org