Repository: spark
Updated Branches:
  refs/heads/master 4ba267356 -> f25bbbdb3


[SPARK-3280] Made sort-based shuffle the default implementation

Sort-based shuffle has lower memory usage and seems to outperform hash-based in 
almost all of our testing.

Author: Reynold Xin <[email protected]>

Closes #2178 from rxin/sort-shuffle and squashes the following commits:

713d341 [Reynold Xin] Fixed test failures by setting spark.shuffle.compress to 
the same value as spark.shuffle.spill.compress.
85165e6 [Reynold Xin] Fixed a comment typo.
aa0d372 [Reynold Xin] [SPARK-3280] Made sort-based shuffle the default 
implementation


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f25bbbdb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f25bbbdb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f25bbbdb

Branch: refs/heads/master
Commit: f25bbbdb3ac5620850c7d09d6a63af888411ecf1
Parents: 4ba2673
Author: Reynold Xin <[email protected]>
Authored: Sun Sep 7 20:42:07 2014 -0700
Committer: Reynold Xin <[email protected]>
Committed: Sun Sep 7 20:42:07 2014 -0700

----------------------------------------------------------------------
 .../main/scala/org/apache/spark/SparkEnv.scala  |  2 +-
 .../org/apache/spark/HashShuffleSuite.scala     | 33 ++++++++++++++++++++
 .../scala/org/apache/spark/ShuffleSuite.scala   |  2 +-
 .../org/apache/spark/SortShuffleSuite.scala     |  3 +-
 .../collection/ExternalAppendOnlyMapSuite.scala |  1 +
 docs/configuration.md                           |  9 +++---
 6 files changed, 41 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/f25bbbdb/core/src/main/scala/org/apache/spark/SparkEnv.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala
index 2973d00..20a7444 100644
--- a/core/src/main/scala/org/apache/spark/SparkEnv.scala
+++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala
@@ -217,7 +217,7 @@ object SparkEnv extends Logging {
     val shortShuffleMgrNames = Map(
       "hash" -> "org.apache.spark.shuffle.hash.HashShuffleManager",
       "sort" -> "org.apache.spark.shuffle.sort.SortShuffleManager")
-    val shuffleMgrName = conf.get("spark.shuffle.manager", "hash")
+    val shuffleMgrName = conf.get("spark.shuffle.manager", "sort")
     val shuffleMgrClass = shortShuffleMgrNames.getOrElse(shuffleMgrName.toLowerCase, shuffleMgrName)
     val shuffleManager = instantiateClass[ShuffleManager](shuffleMgrClass)
 

http://git-wip-us.apache.org/repos/asf/spark/blob/f25bbbdb/core/src/test/scala/org/apache/spark/HashShuffleSuite.scala
----------------------------------------------------------------------
diff --git a/core/src/test/scala/org/apache/spark/HashShuffleSuite.scala b/core/src/test/scala/org/apache/spark/HashShuffleSuite.scala
new file mode 100644
index 0000000..2acc02a
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/HashShuffleSuite.scala
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark
+
+import org.scalatest.BeforeAndAfterAll
+
+class HashShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {
+
+  // This test suite should run all tests in ShuffleSuite with hash-based shuffle.
+
+  override def beforeAll() {
+    System.setProperty("spark.shuffle.manager", "hash")
+  }
+
+  override def afterAll() {
+    System.clearProperty("spark.shuffle.manager")
+  }
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/f25bbbdb/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
----------------------------------------------------------------------
diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
index b13ddf9..15aa4d8 100644
--- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
@@ -26,7 +26,7 @@ import org.apache.spark.rdd.{CoGroupedRDD, OrderedRDDFunctions, RDD, ShuffledRDD
 import org.apache.spark.serializer.KryoSerializer
 import org.apache.spark.util.MutablePair
 
-class ShuffleSuite extends FunSuite with Matchers with LocalSparkContext {
+abstract class ShuffleSuite extends FunSuite with Matchers with LocalSparkContext {
 
   val conf = new SparkConf(loadDefaults = false)
 

http://git-wip-us.apache.org/repos/asf/spark/blob/f25bbbdb/core/src/test/scala/org/apache/spark/SortShuffleSuite.scala
----------------------------------------------------------------------
diff --git a/core/src/test/scala/org/apache/spark/SortShuffleSuite.scala b/core/src/test/scala/org/apache/spark/SortShuffleSuite.scala
index 5c02c00..639e56c 100644
--- a/core/src/test/scala/org/apache/spark/SortShuffleSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SortShuffleSuite.scala
@@ -24,8 +24,7 @@ class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {
   // This test suite should run all tests in ShuffleSuite with sort-based shuffle.
 
   override def beforeAll() {
-    System.setProperty("spark.shuffle.manager",
-      "org.apache.spark.shuffle.sort.SortShuffleManager")
+    System.setProperty("spark.shuffle.manager", "sort")
   }
 
   override def afterAll() {

http://git-wip-us.apache.org/repos/asf/spark/blob/f25bbbdb/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
----------------------------------------------------------------------
diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
index ac3931e..511d76c 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
@@ -42,6 +42,7 @@ class ExternalAppendOnlyMapSuite extends FunSuite with LocalSparkContext {
     conf.set("spark.serializer.objectStreamReset", "1")
     conf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
     conf.set("spark.shuffle.spill.compress", codec.isDefined.toString)
+    conf.set("spark.shuffle.compress", codec.isDefined.toString)
     codec.foreach { c => conf.set("spark.io.compression.codec", c) }
     // Ensure that we actually have multiple batches per spill file
     conf.set("spark.shuffle.spill.batchSize", "10")

http://git-wip-us.apache.org/repos/asf/spark/blob/f25bbbdb/docs/configuration.md
----------------------------------------------------------------------
diff --git a/docs/configuration.md b/docs/configuration.md
index 65a422c..36178ef 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -293,12 +293,11 @@ Apart from these, the following properties are also available, and may be useful
 </tr>
 <tr>
   <td><code>spark.shuffle.manager</code></td>
-  <td>HASH</td>
+  <td>sort</td>
   <td>
-    Implementation to use for shuffling data. A hash-based shuffle manager is the default, but
-    starting in Spark 1.1 there is an experimental sort-based shuffle manager that is more 
-    memory-efficient in environments with small executors, such as YARN. To use that, change
-    this value to <code>SORT</code>.
+    Implementation to use for shuffling data. There are two implementations available:
+    <code>sort</code> and <code>hash</code>. Sort-based shuffle is more memory-efficient and is
+    the default option starting in 1.2.
   </td>
 </tr>
 <tr>


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to