Repository: spark Updated Branches: refs/heads/master 1b2c2162a -> 834651835
[SPARK-12598][CORE] bug in setMinPartitions There is a bug in the calculation of ```maxSplitSize```. The ```totalLen``` should be divided by ```minPartitions``` and not by ```files.size```. Author: Darek Blasiak <darek.blas...@640labs.com> Closes #10546 from datafarmer/setminpartitionsbug. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/83465183 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/83465183 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/83465183 Branch: refs/heads/master Commit: 8346518357f4a3565ae41e9a5ccd7e2c3ed6c468 Parents: 1b2c216 Author: Darek Blasiak <darek.blas...@640labs.com> Authored: Thu Jan 7 21:15:40 2016 +0000 Committer: Sean Owen <so...@cloudera.com> Committed: Thu Jan 7 21:15:40 2016 +0000 ---------------------------------------------------------------------- .../main/scala/org/apache/spark/input/PortableDataStream.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/83465183/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala b/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala index 8009491..18cb763 100644 --- a/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala +++ b/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala @@ -41,9 +41,8 @@ private[spark] abstract class StreamFileInputFormat[T] * which is set through setMaxSplitSize */ def setMinPartitions(context: JobContext, minPartitions: Int) { - val files = listStatus(context).asScala - val totalLen = files.map(file => if (file.isDirectory) 0L else file.getLen).sum - val maxSplitSize = Math.ceil(totalLen * 1.0 / files.size).toLong + val totalLen = listStatus(context).asScala.filterNot(_.isDirectory).map(_.getLen).sum + val maxSplitSize = math.ceil(totalLen / math.max(minPartitions, 1.0)).toLong super.setMaxSplitSize(maxSplitSize) } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org