[3/3] git commit: Adding back hive support
Adding back hive support Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e8ca3970 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e8ca3970 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e8ca3970 Branch: refs/heads/branch-1.0 Commit: e8ca3970c705bf5b63a8d6d587c6fb1a2ea46abe Parents: fdadad0 Author: Patrick Wendell Authored: Wed May 14 10:21:27 2014 -0700 Committer: Patrick Wendell Committed: Wed May 14 10:21:27 2014 -0700 -- dev/create-release/create-release.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e8ca3970/dev/create-release/create-release.sh -- diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh index 4f42afc..c4e7499 100755 --- a/dev/create-release/create-release.sh +++ b/dev/create-release/create-release.sh @@ -109,9 +109,9 @@ make_binary_release() { spark-$RELEASE_VERSION-bin-$NAME.tgz.sha } -make_binary_release "hadoop1" "--hadoop 1.0.4" -make_binary_release "cdh4" "--hadoop 2.0.0-mr1-cdh4.2.0" -make_binary_release "hadoop2" "--with-yarn --hadoop 2.2.0" +make_binary_release "hadoop1" "--with-hive --hadoop 1.0.4" +make_binary_release "cdh4" "--with-hive --hadoop 2.0.0-mr1-cdh4.2.0" +make_binary_release "hadoop2" "--with-hive --with-yarn --hadoop 2.2.0" # Copy data echo "Copying release tarballs"
git commit: Typo fix: fetchting -> fetching
Repository: spark Updated Branches: refs/heads/branch-1.0 69e2726d4 -> 0759ee790 Typo fix: fetchting -> fetching Author: Andrew Ash Closes #680 from ash211/patch-3 and squashes the following commits: 9ce3746 [Andrew Ash] Typo fix: fetchting -> fetching (cherry picked from commit d00981a95185229fd1594d5c030a00f219fb1a14) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0759ee79 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0759ee79 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0759ee79 Branch: refs/heads/branch-1.0 Commit: 0759ee790527f61bf9f4bcef4aa0befa1d430370 Parents: 69e2726 Author: Andrew Ash Authored: Wed May 7 17:24:49 2014 -0400 Committer: Reynold Xin Committed: Wed May 7 17:29:35 2014 -0400 -- make-distribution.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0759ee79/make-distribution.sh -- diff --git a/make-distribution.sh b/make-distribution.sh index ebcd8c7..759e555 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -189,7 +189,7 @@ if [ "$SPARK_TACHYON" == "true" ]; then TMPD=`mktemp -d 2>/dev/null || mktemp -d -t 'disttmp'` pushd $TMPD > /dev/null - echo "Fetchting tachyon tgz" + echo "Fetching tachyon tgz" wget "$TACHYON_URL" tar xf "tachyon-${TACHYON_VERSION}-bin.tar.gz"
git commit: SPARK-1544 Add support for deep decision trees.
Repository: spark Updated Branches: refs/heads/master 0c19bb161 -> f269b016a SPARK-1544 Add support for deep decision trees. @etrain and I came with a PR for arbitrarily deep decision trees at the cost of multiple passes over the data at deep tree levels. To summarize: 1) We take a parameter that indicates the amount of memory users want to reserve for computation on each worker (and 2x that at the driver). 2) Using that information, we calculate two things - the maximum depth to which we train as usual (which is, implicitly, the maximum number of nodes we want to train in parallel), and the size of the groups we should use in the case where we exceed this depth. cc: @atalwalkar, @hirakendu, @mengxr Author: Manish Amde Author: manishamde Author: Evan Sparks Closes #475 from manishamde/deep_tree and squashes the following commits: 968ca9d [Manish Amde] merged master 7fc9545 [Manish Amde] added docs ce004a1 [Manish Amde] minor formatting b27ad2c [Manish Amde] formatting 426bb28 [Manish Amde] programming guide blurb 8053fed [Manish Amde] more formatting 5eca9e4 [Manish Amde] grammar 4731cda [Manish Amde] formatting 5e82202 [Manish Amde] added documentation, fixed off by 1 error in max level calculation cbd9f14 [Manish Amde] modified scala.math to math dad9652 [Manish Amde] removed unused imports e0426ee [Manish Amde] renamed parameter 718506b [Manish Amde] added unit test 1517155 [Manish Amde] updated documentation 9dbdabe [Manish Amde] merge from master 719d009 [Manish Amde] updating user documentation fecf89a [manishamde] Merge pull request #6 from etrain/deep_tree 0287772 [Evan Sparks] Fixing scalastyle issue. 2f1e093 [Manish Amde] minor: added doc for maxMemory parameter 2f6072c [manishamde] Merge pull request #5 from etrain/deep_tree abc5a23 [Evan Sparks] Parameterizing max memory. 50b143a [Manish Amde] adding support for very deep trees Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f269b016 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f269b016 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f269b016 Branch: refs/heads/master Commit: f269b016acb17b24d106dc2b32a1be389489bb01 Parents: 0c19bb1 Author: Manish Amde Authored: Wed May 7 17:08:38 2014 -0700 Committer: Patrick Wendell Committed: Wed May 7 17:08:38 2014 -0700 -- docs/mllib-decision-tree.md | 15 ++- .../examples/mllib/DecisionTreeRunner.scala | 2 +- .../apache/spark/mllib/tree/DecisionTree.scala | 103 +-- .../mllib/tree/configuration/Strategy.scala | 6 +- .../spark/mllib/tree/DecisionTreeSuite.scala| 84 +-- 5 files changed, 177 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f269b016/docs/mllib-decision-tree.md -- diff --git a/docs/mllib-decision-tree.md b/docs/mllib-decision-tree.md index 296277e..acf0fef 100644 --- a/docs/mllib-decision-tree.md +++ b/docs/mllib-decision-tree.md @@ -93,17 +93,14 @@ The recursive tree construction is stopped at a node when one of the two conditi 1. The node depth is equal to the `maxDepth` training parameter 2. No split candidate leads to an information gain at the node. +### Max memory requirements + +For faster processing, the decision tree algorithm performs simultaneous histogram computations for all nodes at each level of the tree. This could lead to high memory requirements at deeper levels of the tree leading to memory overflow errors. 
To alleviate this problem, a `maxMemoryInMB` training parameter is provided which specifies the maximum amount of memory at the workers (twice as much at the master) to be allocated to the histogram computation. The default value is conservatively chosen to be 128 MB to allow the decision algorithm to work in most scenarios. Once the memory requirements for a level-wise computation cross the `maxMemoryInMB` threshold, the node training tasks at each subsequent level are split into smaller tasks. + ### Practical limitations -1. The tree implementation stores an `Array[Double]` of size *O(#features \* #splits \* 2^maxDepth)* - in memory for aggregating histograms over partitions. The current implementation might not scale - to very deep trees since the memory requirement grows exponentially with tree depth. -2. The implemented algorithm reads both sparse and dense data. However, it is not optimized for - sparse input. -3. Python is not supported in this release. - -We are planning to solve these problems in the near future. Please drop us a line if you encounter -any issues. +1. The implemented algorithm reads both sparse and dense data. However, it is not optimized for sparse input. +2. Python is not supported in this release.
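For context, a minimal sketch of how the new memory knob might be used when training a tree (assuming the MLlib API at this commit; the values and the `trainingData` RDD are illustrative):

    import org.apache.spark.mllib.tree.DecisionTree
    import org.apache.spark.mllib.tree.configuration.Algo
    import org.apache.spark.mllib.tree.configuration.Strategy
    import org.apache.spark.mllib.tree.impurity.Gini

    // Reserve 256 MB per worker (2x at the driver) for histogram aggregation; levels whose
    // nodes would exceed this budget are trained in smaller groups with extra passes.
    val strategy = new Strategy(algo = Algo.Classification, impurity = Gini, maxDepth = 15,
      maxMemoryInMB = 256)
    val model = DecisionTree.train(trainingData, strategy) // trainingData: RDD[LabeledPoint]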
git commit: [SPARK-1755] Respect SparkSubmit --name on YARN
Repository: spark Updated Branches: refs/heads/branch-1.0 ab912271a -> 666bebe63 [SPARK-1755] Respect SparkSubmit --name on YARN Right now, SparkSubmit ignores the `--name` flag for both yarn-client and yarn-cluster. This is a bug. In client mode, SparkSubmit treats `--name` as a [cluster config](https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala#L170) and does not propagate this to SparkContext. In cluster mode, SparkSubmit passes this flag to `org.apache.spark.deploy.yarn.Client`, which only uses it for the [YARN ResourceManager](https://github.com/apache/spark/blob/master/yarn/stable/src/main/scala/org/apache/spark/deploy/yarn/Client.scala#L80), but does not propagate this to SparkContext. This PR ensures that `spark.app.name` is always set if SparkSubmit receives the `--name` flag, which is what the usage promises. This makes it possible for applications to start a SparkContext with an empty conf `val sc = new SparkContext(new SparkConf)`, and inherit the app name from SparkSubmit. Tested both modes on a YARN cluster. Author: Andrew Or Closes #699 from andrewor14/yarn-app-name and squashes the following commits: 98f6a79 [Andrew Or] Fix tests dea932f [Andrew Or] Merge branch 'master' of github.com:apache/spark into yarn-app-name c86d9ca [Andrew Or] Respect SparkSubmit --name on YARN (cherry picked from commit 8b7841299439b7dc590b2f7e2339f24e8f3e19f6) Signed-off-by: Patrick Wendell Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/666bebe6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/666bebe6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/666bebe6 Branch: refs/heads/branch-1.0 Commit: 666bebe63ef30be80dd1496e5f9164dd4cdb2016 Parents: ab91227 Author: Andrew Or Authored: Thu May 8 20:45:29 2014 -0700 Committer: Patrick Wendell Committed: Thu May 8 20:45:37 2014 -0700 -- .../main/scala/org/apache/spark/deploy/SparkSubmit.scala | 9 + .../scala/org/apache/spark/deploy/SparkSubmitSuite.scala | 10 ++ 2 files changed, 11 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/666bebe6/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index e39723f..16de6f7 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -160,6 +160,7 @@ object SparkSubmit { // each deploy mode; we iterate through these below val options = List[OptionAssigner]( OptionAssigner(args.master, ALL_CLUSTER_MGRS, false, sysProp = "spark.master"), + OptionAssigner(args.name, ALL_CLUSTER_MGRS, false, sysProp = "spark.app.name"), OptionAssigner(args.driverExtraClassPath, STANDALONE | YARN, true, sysProp = "spark.driver.extraClassPath"), OptionAssigner(args.driverExtraJavaOptions, STANDALONE | YARN, true, @@ -167,7 +168,7 @@ object SparkSubmit { OptionAssigner(args.driverExtraLibraryPath, STANDALONE | YARN, true, sysProp = "spark.driver.extraLibraryPath"), OptionAssigner(args.driverMemory, YARN, true, clOption = "--driver-memory"), - OptionAssigner(args.name, YARN, true, clOption = "--name"), + OptionAssigner(args.name, YARN, true, clOption = "--name", sysProp = "spark.app.name"), OptionAssigner(args.queue, YARN, true, clOption = "--queue"), OptionAssigner(args.queue, YARN, false, sysProp = "spark.yarn.queue"), 
OptionAssigner(args.numExecutors, YARN, true, clOption = "--num-executors"), @@ -188,8 +189,7 @@ object SparkSubmit { OptionAssigner(args.jars, YARN, true, clOption = "--addJars"), OptionAssigner(args.files, LOCAL | STANDALONE | MESOS, false, sysProp = "spark.files"), OptionAssigner(args.files, LOCAL | STANDALONE | MESOS, true, sysProp = "spark.files"), - OptionAssigner(args.jars, LOCAL | STANDALONE | MESOS, false, sysProp = "spark.jars"), - OptionAssigner(args.name, LOCAL | STANDALONE | MESOS, false, sysProp = "spark.app.name") + OptionAssigner(args.jars, LOCAL | STANDALONE | MESOS, false, sysProp = "spark.jars") ) // For client mode make any added jars immediately visible on the classpath @@ -205,7 +205,8 @@ object SparkSubmit { (clusterManager & opt.clusterManager) != 0) { if (opt.clOption != null) { childArgs += (opt.clOption, opt.value) -} else if (opt.sysProp != null) { +} +if (opt.sysProp != null) { sysProps.put(opt.sysProp, opt.value) } }
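For illustration, the behavior this enables (a hedged sketch, not part of the patch; the application class is hypothetical):

    import org.apache.spark.{SparkConf, SparkContext}

    // Launched with: spark-submit --name MyApp --master yarn-cluster app.jar
    object MyApp {
      def main(args: Array[String]) {
        val sc = new SparkContext(new SparkConf()) // spark.app.name now arrives from --name
        println(sc.appName)                        // prints "MyApp" instead of a default
        sc.stop()
      }
    }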
git commit: Proposal: clarify Scala programming guide on caching ...
Repository: spark Updated Branches: refs/heads/master 25ad8f930 -> 48ba3b8cd Proposal: clarify Scala programming guide on caching ... ... with regards to saved map output. Wording taken partially from Matei Zaharia's email to the Spark user list. http://apache-spark-user-list.1001560.n3.nabble.com/performance-improvement-on-second-operation-without-caching-td5227.html Author: Ethan Jewett Closes #668 from esjewett/Doc-update and squashes the following commits: 11793ce [Ethan Jewett] Update based on feedback 171e670 [Ethan Jewett] Clarify Scala programming guide on caching ... Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/48ba3b8c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/48ba3b8c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/48ba3b8c Branch: refs/heads/master Commit: 48ba3b8cdc3bdc7c67bc465d1f047fa3f44d7085 Parents: 25ad8f9 Author: Ethan Jewett Authored: Tue May 6 20:50:08 2014 -0700 Committer: Patrick Wendell Committed: Tue May 6 20:50:08 2014 -0700 -- docs/scala-programming-guide.md | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/48ba3b8c/docs/scala-programming-guide.md -- diff --git a/docs/scala-programming-guide.md b/docs/scala-programming-guide.md index e7ceaa2..f25e9cc 100644 --- a/docs/scala-programming-guide.md +++ b/docs/scala-programming-guide.md @@ -145,7 +145,7 @@ RDDs support two types of operations: *transformations*, which create a new data All transformations in Spark are lazy, in that they do not compute their results right away. Instead, they just remember the transformations applied to some base dataset (e.g. a file). The transformations are only computed when an action requires a result to be returned to the driver program. This design enables Spark to run more efficiently -- for example, we can realize that a dataset created through `map` will be used in a `reduce` and return only the result of the `reduce` to the driver, rather than the larger mapped dataset. -By default, each transformed RDD is recomputed each time you run an action on it. However, you may also *persist* an RDD in memory using the `persist` (or `cache`) method, in which case Spark will keep the elements around on the cluster for much faster access the next time you query it. There is also support for persisting datasets on disk, or replicated across the cluster. The next section in this document describes these options. +By default, each transformed RDD may be recomputed each time you run an action on it. However, you may also *persist* an RDD in memory using the `persist` (or `cache`) method, in which case Spark will keep the elements around on the cluster for much faster access the next time you query it. There is also support for persisting datasets on disk, or replicated across the cluster. The next section in this document describes these options. The following tables list the transformations and actions currently supported (see also the [RDD API doc](api/scala/index.html#org.apache.spark.rdd.RDD) for details): @@ -279,8 +279,8 @@ it is computed in an action, it will be kept in memory on the nodes. The cache i if any partition of an RDD is lost, it will automatically be recomputed using the transformations that originally created it. 
-In addition, each RDD can be stored using a different *storage level*, allowing you, for example, to -persist the dataset on disk, or persist it in memory but as serialized Java objects (to save space), +In addition, each persisted RDD can be stored using a different *storage level*, allowing you, for example, +to persist the dataset on disk, or persist it in memory but as serialized Java objects (to save space), or replicate it across nodes, or store the data in off-heap memory in [Tachyon](http://tachyon-project.org/). These levels are chosen by passing a [`org.apache.spark.storage.StorageLevel`](api/scala/index.html#org.apache.spark.storage.StorageLevel) @@ -330,6 +330,8 @@ available storage levels is: +Spark sometimes automatically persists intermediate state from RDD operations, even without users calling persist() or cache(). In particular, if a shuffle happens when computing an RDD, Spark will keep the outputs from the map side of the shuffle on disk to avoid re-computing the entire dependency graph if an RDD is re-used. We still recommend users call persist() if they plan to re-use an RDD iteratively. + ### Which Storage Level to Choose? Spark's storage levels are meant to provide different trade-offs between memory usage and CPU
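For readers following the guide text, a small example of the persistence API being described (assumes an existing SparkContext `sc`):

    import org.apache.spark.storage.StorageLevel

    val words = sc.textFile("data.txt").flatMap(_.split(" "))
    words.persist(StorageLevel.MEMORY_ONLY_SER) // keep partitions as serialized Java objects to save space
    words.count() // first action computes the RDD and caches it
    words.count() // subsequent actions reuse the cached partitions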
git commit: Update GradientDescentSuite.scala
Repository: spark Updated Branches: refs/heads/master 3188553f7 -> 0c19bb161 Update GradientDescentSuite.scala use a faster way to construct an array Author: baishuo(白硕) Closes #588 from baishuo/master and squashes the following commits: 45b95fb [baishuo(白硕)] Update GradientDescentSuite.scala c03b61c [baishuo(白硕)] Update GradientDescentSuite.scala b666d27 [baishuo(白硕)] Update GradientDescentSuite.scala Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0c19bb16 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0c19bb16 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0c19bb16 Branch: refs/heads/master Commit: 0c19bb161b9b2b96c0c55d3ea09e81fd798cbec0 Parents: 3188553 Author: baishuo(白硕) Authored: Wed May 7 16:02:55 2014 -0700 Committer: Patrick Wendell Committed: Wed May 7 16:02:55 2014 -0700 -- .../apache/spark/mllib/optimization/GradientDescentSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0c19bb16/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala index c4b4334..8a16284 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala @@ -81,11 +81,11 @@ class GradientDescentSuite extends FunSuite with LocalSparkContext with ShouldMa // Add a extra variable consisting of all 1.0's for the intercept. val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42) val data = testData.map { case LabeledPoint(label, features) => - label -> Vectors.dense(1.0, features.toArray: _*) + label -> Vectors.dense(1.0 +: features.toArray) } val dataRDD = sc.parallelize(data, 2).cache() -val initialWeightsWithIntercept = Vectors.dense(1.0, initialWeights: _*) +val initialWeightsWithIntercept = Vectors.dense(1.0 +: initialWeights.toArray) val (_, loss) = GradientDescent.runMiniBatchSGD( dataRDD, @@ -111,7 +111,7 @@ class GradientDescentSuite extends FunSuite with LocalSparkContext with ShouldMa // Add a extra variable consisting of all 1.0's for the intercept. val testData = GradientDescentSuite.generateGDInput(2.0, -1.5, 1, 42) val data = testData.map { case LabeledPoint(label, features) => - label -> Vectors.dense(1.0, features.toArray: _*) + label -> Vectors.dense(1.0 +: features.toArray) } val dataRDD = sc.parallelize(data, 2).cache()
git commit: Fixed streaming examples docs to use run-example instead of spark-submit
Repository: spark Updated Branches: refs/heads/master 69f750228 -> 68f28dabe Fixed streaming examples docs to use run-example instead of spark-submit Pretty self-explanatory Author: Tathagata Das Closes #722 from tdas/example-fix and squashes the following commits: 7839979 [Tathagata Das] Minor changes. 0673441 [Tathagata Das] Fixed java docs of java streaming example e687123 [Tathagata Das] Fixed scala style errors. 9b8d112 [Tathagata Das] Fixed streaming examples docs to use run-example instead of spark-submit. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/68f28dab Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/68f28dab Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/68f28dab Branch: refs/heads/master Commit: 68f28dabe9c7679be82e684385be216319beb610 Parents: 69f7502 Author: Tathagata Das Authored: Wed May 14 04:17:32 2014 -0700 Committer: Tathagata Das Committed: Wed May 14 04:17:32 2014 -0700 -- .../examples/streaming/JavaCustomReceiver.java | 13 ++--- .../examples/streaming/JavaFlumeEventCount.java | 6 +- .../examples/streaming/JavaKafkaWordCount.java | 6 +- .../streaming/JavaNetworkWordCount.java | 13 +++-- .../examples/streaming/ActorWordCount.scala | 6 +- .../examples/streaming/CustomReceiver.scala | 19 +++ .../examples/streaming/FlumeEventCount.scala| 9 ++- .../examples/streaming/HdfsWordCount.scala | 5 +- .../examples/streaming/KafkaWordCount.scala | 6 +- .../examples/streaming/MQTTWordCount.scala | 10 ++-- .../examples/streaming/NetworkWordCount.scala | 14 +++-- .../streaming/RecoverableNetworkWordCount.scala | 7 +-- .../streaming/StatefulNetworkWordCount.scala| 6 +- .../examples/streaming/TwitterPopularTags.scala | 22 +++- .../examples/streaming/ZeroMQWordCount.scala| 8 +-- .../clickstream/PageViewGenerator.scala | 10 ++-- .../streaming/clickstream/PageViewStream.scala | 7 ++- .../streaming/twitter/TwitterInputDStream.scala | 58 18 files changed, 130 insertions(+), 95 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/68f28dab/examples/src/main/java/org/apache/spark/examples/streaming/JavaCustomReceiver.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/streaming/JavaCustomReceiver.java b/examples/src/main/java/org/apache/spark/examples/streaming/JavaCustomReceiver.java index 7f558f3..5622df5 100644 --- a/examples/src/main/java/org/apache/spark/examples/streaming/JavaCustomReceiver.java +++ b/examples/src/main/java/org/apache/spark/examples/streaming/JavaCustomReceiver.java @@ -19,6 +19,7 @@ package org.apache.spark.examples.streaming; import com.google.common.collect.Lists; +import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.Function2; import org.apache.spark.api.java.function.PairFunction; @@ -48,25 +49,23 @@ import java.util.regex.Pattern; * To run this on your local machine, you need to first run a Netcat server *`$ nc -lk ` * and then run the example - *`$ ./run org.apache.spark.examples.streaming.JavaCustomReceiver local[2] localhost ` + *`$ bin/run-example org.apache.spark.examples.streaming.JavaCustomReceiver localhost ` */ public class JavaCustomReceiver extends Receiver { private static final Pattern SPACE = Pattern.compile(" "); public static void main(String[] args) { -if (args.length < 3) { - System.err.println("Usage: JavaNetworkWordCount \n" + - "In local mode, should be 'local[n]' with n > 1"); +if (args.length < 2) { + System.err.println("Usage: 
JavaNetworkWordCount "); System.exit(1); } StreamingExamples.setStreamingLogLevels(); // Create the context with a 1 second batch size -JavaStreamingContext ssc = new JavaStreamingContext(args[0], "JavaNetworkWordCount", -new Duration(1000), System.getenv("SPARK_HOME"), -JavaStreamingContext.jarOfClass(JavaNetworkWordCount.class)); +SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver"); +JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000)); // Create a input stream with the custom receiver on target ip:port and count the // words in input stream of \n delimited text (eg. generated by 'nc') http://git-wip-us.apache.org/repos/asf/spark/blob/68f28dab/examples/src/main/java/org/apache/spark/examples/streaming/JavaFlumeEventCount.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/streaming/JavaFlumeEventCount.ja
git commit: Proposal: clarify Scala programming guide on caching ...
Repository: spark Updated Branches: refs/heads/branch-1.0 514ee93da -> 51e277557 Proposal: clarify Scala programming guide on caching ... ... with regards to saved map output. Wording taken partially from Matei Zaharia's email to the Spark user list. http://apache-spark-user-list.1001560.n3.nabble.com/performance-improvement-on-second-operation-without-caching-td5227.html Author: Ethan Jewett Closes #668 from esjewett/Doc-update and squashes the following commits: 11793ce [Ethan Jewett] Update based on feedback 171e670 [Ethan Jewett] Clarify Scala programming guide on caching ... (cherry picked from commit 48ba3b8cdc3bdc7c67bc465d1f047fa3f44d7085) Signed-off-by: Patrick Wendell Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/51e27755 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/51e27755 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/51e27755 Branch: refs/heads/branch-1.0 Commit: 51e27755750e896ac632f4a40b362bd580e21ced Parents: 514ee93 Author: Ethan Jewett Authored: Tue May 6 20:50:08 2014 -0700 Committer: Patrick Wendell Committed: Tue May 6 20:50:18 2014 -0700 -- docs/scala-programming-guide.md | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/51e27755/docs/scala-programming-guide.md -- diff --git a/docs/scala-programming-guide.md b/docs/scala-programming-guide.md index e7ceaa2..f25e9cc 100644 --- a/docs/scala-programming-guide.md +++ b/docs/scala-programming-guide.md @@ -145,7 +145,7 @@ RDDs support two types of operations: *transformations*, which create a new data All transformations in Spark are lazy, in that they do not compute their results right away. Instead, they just remember the transformations applied to some base dataset (e.g. a file). The transformations are only computed when an action requires a result to be returned to the driver program. This design enables Spark to run more efficiently -- for example, we can realize that a dataset created through `map` will be used in a `reduce` and return only the result of the `reduce` to the driver, rather than the larger mapped dataset. -By default, each transformed RDD is recomputed each time you run an action on it. However, you may also *persist* an RDD in memory using the `persist` (or `cache`) method, in which case Spark will keep the elements around on the cluster for much faster access the next time you query it. There is also support for persisting datasets on disk, or replicated across the cluster. The next section in this document describes these options. +By default, each transformed RDD may be recomputed each time you run an action on it. However, you may also *persist* an RDD in memory using the `persist` (or `cache`) method, in which case Spark will keep the elements around on the cluster for much faster access the next time you query it. There is also support for persisting datasets on disk, or replicated across the cluster. The next section in this document describes these options. The following tables list the transformations and actions currently supported (see also the [RDD API doc](api/scala/index.html#org.apache.spark.rdd.RDD) for details): @@ -279,8 +279,8 @@ it is computed in an action, it will be kept in memory on the nodes. The cache i if any partition of an RDD is lost, it will automatically be recomputed using the transformations that originally created it. 
-In addition, each RDD can be stored using a different *storage level*, allowing you, for example, to -persist the dataset on disk, or persist it in memory but as serialized Java objects (to save space), +In addition, each persisted RDD can be stored using a different *storage level*, allowing you, for example, +to persist the dataset on disk, or persist it in memory but as serialized Java objects (to save space), or replicate it across nodes, or store the data in off-heap memory in [Tachyon](http://tachyon-project.org/). These levels are chosen by passing a [`org.apache.spark.storage.StorageLevel`](api/scala/index.html#org.apache.spark.storage.StorageLevel) @@ -330,6 +330,8 @@ available storage levels is: +Spark sometimes automatically persists intermediate state from RDD operations, even without users calling persist() or cache(). In particular, if a shuffle happens when computing an RDD, Spark will keep the outputs from the map side of the shuffle on disk to avoid re-computing the entire dependency graph if an RDD is re-used. We still recommend users call persist() if they plan to re-use an RDD iteratively. + ### Which Storage Level to Choose? Spark's storage levels are meant to provide different trade-offs between memory usage and
git commit: Nicer logging for SecurityManager startup
Repository: spark Updated Branches: refs/heads/master ca4318686 -> 7f6f4a103 Nicer logging for SecurityManager startup Happy to open a jira ticket if you'd like to track one there. Author: Andrew Ash Closes #678 from ash211/SecurityManagerLogging and squashes the following commits: 2aa0b7a [Andrew Ash] Nicer logging for SecurityManager startup Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7f6f4a10 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7f6f4a10 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7f6f4a10 Branch: refs/heads/master Commit: 7f6f4a1035ae0c9fa2029fe991f621ca263d53e0 Parents: ca43186 Author: Andrew Ash Authored: Wed May 7 17:24:12 2014 -0400 Committer: Reynold Xin Committed: Wed May 7 17:24:12 2014 -0400 -- core/src/main/scala/org/apache/spark/SecurityManager.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7f6f4a10/core/src/main/scala/org/apache/spark/SecurityManager.scala -- diff --git a/core/src/main/scala/org/apache/spark/SecurityManager.scala b/core/src/main/scala/org/apache/spark/SecurityManager.scala index b4b0067..74aa441 100644 --- a/core/src/main/scala/org/apache/spark/SecurityManager.scala +++ b/core/src/main/scala/org/apache/spark/SecurityManager.scala @@ -146,8 +146,9 @@ private[spark] class SecurityManager(sparkConf: SparkConf) extends Logging { setViewAcls(defaultAclUsers, sparkConf.get("spark.ui.view.acls", "")) private val secretKey = generateSecretKey() - logInfo("SecurityManager, is authentication enabled: " + authOn + -" are ui acls enabled: " + uiAclsOn + " users with view permissions: " + viewAcls.toString()) + logInfo("SecurityManager: authentication " + (if (authOn) "enabled" else "disabled") + +"; ui acls " + (if (uiAclsOn) "enabled" else "disabled") + +"; users with view permissions: " + viewAcls.toString()) // Set our own authenticator to properly negotiate user/password for HTTP connections. // This is needed by the HTTP client fetching from the HttpServer. Put here so its
git commit: Use numpy directly for matrix multiply.
Repository: spark Updated Branches: refs/heads/branch-1.0 35aa2448a -> 010040fd0 Use numpy directly for matrix multiply. Using matrix multiply to compute XtX and XtY yields a 5-20x speedup depending on problem size. For example - the following takes 19s locally after this change vs. 5m21s before the change. (16x speedup). bin/pyspark examples/src/main/python/als.py local[8] 1000 1000 50 10 10 Author: Evan Sparks Closes #687 from etrain/patch-1 and squashes the following commits: e094dbc [Evan Sparks] Touching only diaganols on update. d1ab9b6 [Evan Sparks] Use numpy directly for matrix multiply. (cherry picked from commit 6ed7e2cd01955adfbb3960e2986b6d19eaee8717) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/010040fd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/010040fd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/010040fd Branch: refs/heads/branch-1.0 Commit: 010040fd0ddd38717e8747c884bc8b1cbf684d38 Parents: 35aa244 Author: Evan Sparks Authored: Thu May 8 00:24:36 2014 -0400 Committer: Reynold Xin Committed: Thu May 8 00:24:44 2014 -0400 -- examples/src/main/python/als.py | 15 +++ 1 file changed, 7 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/010040fd/examples/src/main/python/als.py -- diff --git a/examples/src/main/python/als.py b/examples/src/main/python/als.py index a77dfb2..33700ab 100755 --- a/examples/src/main/python/als.py +++ b/examples/src/main/python/als.py @@ -36,14 +36,13 @@ def rmse(R, ms, us): def update(i, vec, mat, ratings): uu = mat.shape[0] ff = mat.shape[1] -XtX = matrix(np.zeros((ff, ff))) -Xty = np.zeros((ff, 1)) - -for j in range(uu): -v = mat[j, :] -XtX += v.T * v -Xty += v.T * ratings[i, j] -XtX += np.eye(ff, ff) * LAMBDA * uu + +XtX = mat.T * mat +XtY = mat.T * ratings[i, :].T + +for j in range(ff): +XtX[j,j] += LAMBDA * uu + return np.linalg.solve(XtX, Xty) if __name__ == "__main__":
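In matrix form, the rewritten update is just the regularized normal-equation solve (a sketch using the names from als.py, where uu is the number of rows in mat and LAMBDA the regularization constant):

    (X^T X + LAMBDA * uu * I) w = X^T y

with X = mat and y = ratings[i, :]. XtX and XtY above are exactly these two products, and the remaining loop only adds LAMBDA * uu to the diagonal, which is why the dense per-row accumulation can be dropped.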
git commit: [SPARK-1754] [SQL] Add missing arithmetic DSL operations.
Repository: spark Updated Branches: refs/heads/master 5c5e7d580 -> 322b1808d [SPARK-1754] [SQL] Add missing arithmetic DSL operations. Add missing arithmetic DSL operations: `unary_-`, `%`. Author: Takuya UESHIN Closes #689 from ueshin/issues/SPARK-1754 and squashes the following commits: a09ef69 [Takuya UESHIN] Add also missing ! (not) operation. f73ae2c [Takuya UESHIN] Remove redundant tests. 5b3f087 [Takuya UESHIN] Add tests relating DSL operations. e09c5b8 [Takuya UESHIN] Add missing arithmetic DSL operations. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/322b1808 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/322b1808 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/322b1808 Branch: refs/heads/master Commit: 322b1808d21143dc323493203929488d69e8878a Parents: 5c5e7d5 Author: Takuya UESHIN Authored: Thu May 8 15:31:47 2014 -0700 Committer: Patrick Wendell Committed: Thu May 8 15:31:47 2014 -0700 -- .../org/apache/spark/sql/catalyst/dsl/package.scala | 4 .../expressions/ExpressionEvaluationSuite.scala | 16 +++- 2 files changed, 19 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/322b1808/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index dc83485..78d3a1d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -57,10 +57,14 @@ package object dsl { trait ImplicitOperators { def expr: Expression +def unary_- = UnaryMinus(expr) +def unary_! = Not(expr) + def + (other: Expression) = Add(expr, other) def - (other: Expression) = Subtract(expr, other) def * (other: Expression) = Multiply(expr, other) def / (other: Expression) = Divide(expr, other) +def % (other: Expression) = Remainder(expr, other) def && (other: Expression) = And(expr, other) def || (other: Expression) = Or(expr, other) http://git-wip-us.apache.org/repos/asf/spark/blob/322b1808/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala index 91605d0..344d8a3 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala @@ -61,7 +61,7 @@ class ExpressionEvaluationSuite extends FunSuite { test("3VL Not") { notTrueTable.foreach { case (v, answer) => -val expr = Not(Literal(v, BooleanType)) +val expr = ! 
Literal(v, BooleanType) val result = expr.eval(null) if (result != answer) fail(s"$expr should not evaluate to $result, expected: $answer")} @@ -381,6 +381,13 @@ class ExpressionEvaluationSuite extends FunSuite { checkEvaluation(Add(c1, Literal(null, IntegerType)), null, row) checkEvaluation(Add(Literal(null, IntegerType), c2), null, row) checkEvaluation(Add(Literal(null, IntegerType), Literal(null, IntegerType)), null, row) + +checkEvaluation(-c1, -1, row) +checkEvaluation(c1 + c2, 3, row) +checkEvaluation(c1 - c2, -1, row) +checkEvaluation(c1 * c2, 2, row) +checkEvaluation(c1 / c2, 0, row) +checkEvaluation(c1 % c2, 1, row) } test("BinaryComparison") { @@ -395,6 +402,13 @@ class ExpressionEvaluationSuite extends FunSuite { checkEvaluation(LessThan(c1, Literal(null, IntegerType)), null, row) checkEvaluation(LessThan(Literal(null, IntegerType), c2), null, row) checkEvaluation(LessThan(Literal(null, IntegerType), Literal(null, IntegerType)), null, row) + +checkEvaluation(c1 < c2, true, row) +checkEvaluation(c1 <= c2, true, row) +checkEvaluation(c1 > c2, false, row) +checkEvaluation(c1 >= c2, false, row) +checkEvaluation(c1 === c2, false, row) +checkEvaluation(c1 !== c2, true, row) } }
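A minimal sketch of how the new operators read once catalyst's DSL conversions are in scope (illustrative; the exact imports depend on the dsl package object shown above):

    val a = Literal(7)
    val b = Literal(2)
    val sum = a + b    // Add(a, b), as before
    val rem = a % b    // Remainder(a, b), new in this patch
    val neg = -a       // UnaryMinus(a), new in this patch
    val not = !(a < b) // Not(LessThan(a, b)), new in this patch
    assert(rem.eval(null) == 1)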
git commit: SPARK-1833 - Have an empty SparkContext constructor.
Repository: spark Updated Branches: refs/heads/master a3315d7f4 -> 65533c7ec SPARK-1833 - Have an empty SparkContext constructor. This is nicer than relying on new SparkContext(new SparkConf()) Author: Patrick Wendell Closes #774 from pwendell/spark-context and squashes the following commits: ef9f12f [Patrick Wendell] SPARK-1833 - Have an empty SparkContext constructor. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/65533c7e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/65533c7e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/65533c7e Branch: refs/heads/master Commit: 65533c7ec03e7eedf5cd9756822863ab6f034ec9 Parents: a3315d7 Author: Patrick Wendell Authored: Wed May 14 12:53:30 2014 -0700 Committer: Patrick Wendell Committed: Wed May 14 12:53:30 2014 -0700 -- core/src/main/scala/org/apache/spark/SparkContext.scala | 6 ++ 1 file changed, 6 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/65533c7e/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 032b3d7..634c10c 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -67,6 +67,12 @@ class SparkContext(config: SparkConf) extends Logging { private[spark] var preferredNodeLocationData: Map[String, Set[SplitInfo]] = Map() /** + * Create a SparkContext that loads settings from system properties (for instance, when + * launching with ./bin/spark-submit). + */ + def this() = this(new SparkConf()) + + /** * :: DeveloperApi :: * Alternative constructor for setting preferred locations where Spark will create executors. *
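A quick illustration of the new constructor (assumes the app is launched through spark-submit so spark.* settings arrive as system properties):

    import org.apache.spark.SparkContext

    // Before: val sc = new SparkContext(new SparkConf())
    val sc = new SparkContext() // loads master, app name, etc. from system properties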
git commit: SPARK-1569 Spark on Yarn, authentication broken by pr299
Repository: spark Updated Branches: refs/heads/master 520087224 -> 4bec84b6a SPARK-1569 Spark on Yarn, authentication broken by pr299 Pass the configs as java options since the executor needs to know before it registers whether to create the connection using authentication or not.We could see about passing only the authentication configs but for now I just had it pass them all. I also updating it to use a list to construct the command to make it the same as ClientBase and avoid any issues with spaces. Author: Thomas Graves Closes #649 from tgravescs/SPARK-1569 and squashes the following commits: 0178ab8 [Thomas Graves] add akka settings 22a8735 [Thomas Graves] Change to only path spark.auth* configs 8ccc1d4 [Thomas Graves] SPARK-1569 Spark on Yarn, authentication broken Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4bec84b6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4bec84b6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4bec84b6 Branch: refs/heads/master Commit: 4bec84b6a23e1e642708a70a6c7ef7b3d1df9b3e Parents: 5200872 Author: Thomas Graves Authored: Wed May 7 15:51:53 2014 -0700 Committer: Patrick Wendell Committed: Wed May 7 15:51:53 2014 -0700 -- .../deploy/yarn/ExecutorRunnableUtil.scala | 49 1 file changed, 30 insertions(+), 19 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4bec84b6/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnableUtil.scala -- diff --git a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnableUtil.scala b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnableUtil.scala index 96f8aa9..32f8861 100644 --- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnableUtil.scala +++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnableUtil.scala @@ -21,7 +21,7 @@ import java.io.File import java.net.URI import scala.collection.JavaConversions._ -import scala.collection.mutable.HashMap +import scala.collection.mutable.{HashMap, ListBuffer} import org.apache.hadoop.fs.Path import org.apache.hadoop.yarn.api._ @@ -44,9 +44,9 @@ trait ExecutorRunnableUtil extends Logging { hostname: String, executorMemory: Int, executorCores: Int, - localResources: HashMap[String, LocalResource]) = { + localResources: HashMap[String, LocalResource]): List[String] = { // Extra options for the JVM -var JAVA_OPTS = "" +val JAVA_OPTS = ListBuffer[String]() // Set the JVM memory val executorMemoryString = executorMemory + "m" JAVA_OPTS += "-Xms" + executorMemoryString + " -Xmx" + executorMemoryString + " " @@ -56,10 +56,21 @@ trait ExecutorRunnableUtil extends Logging { JAVA_OPTS += opts } -JAVA_OPTS += " -Djava.io.tmpdir=" + - new Path(Environment.PWD.$(), YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR) + " " +JAVA_OPTS += "-Djava.io.tmpdir=" + + new Path(Environment.PWD.$(), YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR) JAVA_OPTS += ClientBase.getLog4jConfiguration(localResources) +// Certain configs need to be passed here because they are needed before the Executor +// registers with the Scheduler and transfers the spark configs. Since the Executor backend +// uses Akka to connect to the scheduler, the akka settings are needed as well as the +// authentication settings. +sparkConf.getAll. + filter { case (k, v) => k.startsWith("spark.auth") || k.startsWith("spark.akka") }. + foreach { case (k, v) => JAVA_OPTS += "-D" + k + "=" + "\\\"" + v + "\\\"" } + +sparkConf.getAkkaConf. 
+ foreach { case (k, v) => JAVA_OPTS += "-D" + k + "=" + "\\\"" + v + "\\\"" } + // Commenting it out for now - so that people can refer to the properties if required. Remove // it once cpuset version is pushed out. // The context is, default gc for server class machines end up using all cores to do gc - hence @@ -85,25 +96,25 @@ trait ExecutorRunnableUtil extends Logging { } */ -val commands = List[String]( - Environment.JAVA_HOME.$() + "/bin/java" + - " -server " + +val commands = Seq(Environment.JAVA_HOME.$() + "/bin/java", + "-server", // Kill if OOM is raised - leverage yarn's failure handling to cause rescheduling. // Not killing the task leaves various aspects of the executor and (to some extent) the jvm in // an inconsistent state. // TODO: If the OOM is not recoverable by rescheduling it on different node, then do // 'something' to fail job ... akin to blacklisting trackers in mapred ? - " -XX:OnOutOfMemoryError='kill %p' " +
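For context, the kind of configuration that has to reach the executor JVM before it registers (a hedged example; the app name is illustrative):

    import org.apache.spark.{SparkConf, SparkContext}

    val conf = new SparkConf()
      .setAppName("SecureApp")
      .set("spark.authenticate", "true") // shared-secret authentication for Spark's internal connections
    val sc = new SparkContext(conf)
    // On YARN the shared secret is generated and distributed automatically; other deployments
    // also need spark.authenticate.secret. With this fix, spark.auth* and spark.akka* settings
    // are passed to executors as -D JVM options so connections can be authenticated at startup.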
svn commit: r1592937 - in /spark: documentation.md site/documentation.html
Author: andrew Date: Wed May 7 03:45:54 2014 New Revision: 1592937 URL: http://svn.apache.org/r1592937 Log: adding more videos from past meetups to documentation page Modified: spark/documentation.md spark/site/documentation.html Modified: spark/documentation.md URL: http://svn.apache.org/viewvc/spark/documentation.md?rev=1592937&r1=1592936&r2=1592937&view=diff == --- spark/documentation.md (original) +++ spark/documentation.md Wed May 7 03:45:54 2014 @@ -49,18 +49,37 @@ See the http://www.youtube.com/ Meetup Talk Videos +In addition to the videos listed below, you can also view http://www.meetup.com/spark-users/files/";>all slides from Bay Area meetups here. + + .video-meta-info { +font-size: 0.95em; + } + - http://www.youtube.com/watch?v=NUQ-8to2XAk&list=PL-x35fyliRwiP3YteXbnhk0QGOtYLBT3a";>Spark 1.0 and Beyond - Patrick Wendell (http://files.meetup.com/3138542/Spark%201.0%20Meetup.ppt";>slides) by Patrick Wendell, at Cisco in San Jose, 2014-04-23 + http://www.youtube.com/watch?v=NUQ-8to2XAk&list=PL-x35fyliRwiP3YteXbnhk0QGOtYLBT3a";>Spark 1.0 and Beyond (http://files.meetup.com/3138542/Spark%201.0%20Meetup.ppt";>slides) by Patrick Wendell, at Cisco in San Jose, 2014-04-23 - http://www.youtube.com/watch?v=ju2OQEXqONU&list=PL-x35fyliRwiP3YteXbnhk0QGOtYLBT3a";>Adding Native SQL Support to Spark with Catalyst (http://files.meetup.com/3138542/Spark%20SQL%20Meetup%20-%204-8-2012.pdf";>slides) by Michael Armbrust, at Tagged in San Francisco, 2014-04-08 + http://www.youtube.com/watch?v=ju2OQEXqONU&list=PL-x35fyliRwiP3YteXbnhk0QGOtYLBT3a";>Adding Native SQL Support to Spark with Catalyst (http://files.meetup.com/3138542/Spark%20SQL%20Meetup%20-%204-8-2012.pdf";>slides) by Michael Armbrust, at Tagged in SF, 2014-04-08 - http://www.youtube.com/watch?v=MY0NkZY_tJw&list=PL-x35fyliRwiP3YteXbnhk0QGOtYLBT3a";>SparkR and GraphX (http://files.meetup.com/3138542/SparkR-meetup.pdf";>SparkR slides, http://files.meetup.com/3138542/graphx%40spark_meetup03_2014.pdf";>GraphX slides) by Shivaram Venkataraman & Dan Crankshaw, at SkyDeck in Berkeley, 2014-03-25 + http://www.youtube.com/watch?v=MY0NkZY_tJw&list=PL-x35fyliRwiP3YteXbnhk0QGOtYLBT3a";>SparkR and GraphX (slides: http://files.meetup.com/3138542/SparkR-meetup.pdf";>SparkR, http://files.meetup.com/3138542/graphx%40spark_meetup03_2014.pdf";>GraphX) by Shivaram Venkataraman & Dan Crankshaw, at SkyDeck in Berkeley, 2014-03-25 + + http://www.youtube.com/watch?v=5niXiiEX5pE&list=PL-x35fyliRwiP3YteXbnhk0QGOtYLBT3a";>Simple deployment w/ SIMR & Advanced Shark Analytics w/ TGFs (http://files.meetup.com/3138542/tgf.pptx";>slides) by Ali Ghodsi, at Huawei in Santa Clara, 2014-02-05 + + http://www.youtube.com/watch?v=C7gWtxelYNM&list=PL-x35fyliRwiP3YteXbnhk0QGOtYLBT3a";>Stores, Monoids & Dependency Injection - Abstractions for Spark (http://files.meetup.com/3138542/Abstractions%20for%20spark%20streaming%20-%20spark%20meetup%20presentation.pdf";>slides) by Ryan Weald, at Sharethrough in SF, 2014-01-17 + + https://www.youtube.com/watch?v=IxDnF_X4M-8";>Distributed Machine Learning using MLbase (http://files.meetup.com/3138542/sparkmeetup_8_6_13_final_reduced.pdf";>slides) by Evan Sparks & Ameet Talwalkar, at Twitter in SF, 2013-08-06 + + https://www.youtube.com/watch?v=vJQ2RZj9hqs";>GraphX Preview: Graph Analysis on Spark by Reynold Xin & Joseph Gonzalez, at Flurry in SF, 2013-07-02 + + http://www.youtube.com/watch?v=D1knCQZQQnw";>Deep Dive with Spark Streaming (http://www.slideshare.net/spark-project/deep-divewithsparkstreaming-tathagatadassparkmeetup20130617";>slides) 
by Tathagata Das, at Plug and Play in Sunnyvale, 2013-06-17 + + https://www.youtube.com/watch?v=cAZ624-69PQ";>Tachyon and Shark update (slides: http://files.meetup.com/3138542/2013-05-09%20Shark%20%40%20Spark%20Meetup.pdf";>Shark, http://files.meetup.com/3138542/Tachyon_2013-05-09_Spark_Meetup.pdf";>Tachyon) by Ali Ghodsi, Haoyuan Li, Reynold Xin, Google Ventures, 2013-05-09 + + https://www.youtube.com/playlist?list=PLxwbieuTaYXmWTBovyyw2NibPfUaJk-h4";>Spark 0.7: Overview, pySpark, & Streaming by Matei Zaharia, Josh Rosen, Tathagata Das, at Conviva on 2013-02-21 + + https://www.youtube.com/watch?v=49Hr5xZyTEA";>Introduction to Spark Internals (http://files.meetup.com/3138542/dev-meetup-dec-2012.pptx";>slides) by Matei Zaharia, at Yahoo in Sunnyvale, 2012-12-18 - http://www.youtube.com/watch?v=5niXiiEX5pE&list=PL-x35fyliRwiP3YteXbnhk0QGOtYLBT3a";>Simple deployment with SIMR and Advanced Shark Analytics with TGFs (http://files.meetup.com/3138542/tgf.pptx";>slides) by Ali Ghodsi, at Huawei in Santa Clara, 2014-02-05 - http://www.youtube.com/watch?v=C7gWtxelYNM&list=PL-x35fyliRwiP3YteXbnhk0QGOtYLBT3a";>Stores, Monoids and Dependency Injection - Abstractions for Spark (http://files.meetup.com/3138542/Abstractions%20for%20spark%20streaming%20-%20spark%20meetup%2
[1/2] SPARK-1565, update examples to be used with spark-submit script.
Repository: spark Updated Branches: refs/heads/master 19c8fb02b -> 44dd57fb6 http://git-wip-us.apache.org/repos/asf/spark/blob/44dd57fb/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala index 47bf1e5..3a10daa 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala @@ -24,6 +24,7 @@ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.StreamingContext._ import org.apache.spark.streaming.mqtt._ +import org.apache.spark.SparkConf /** * A simple Mqtt publisher for demonstration purposes, repeatedly publishes @@ -64,7 +65,6 @@ object MQTTPublisher { } } -// scalastyle:off /** * A sample wordcount with MqttStream stream * @@ -74,30 +74,28 @@ object MQTTPublisher { * Eclipse paho project provides Java library for Mqtt Client http://www.eclipse.org/paho/ * Example Java code for Mqtt Publisher and Subscriber can be found here * https://bitbucket.org/mkjinesh/mqttclient - * Usage: MQTTWordCount - * In local mode, should be 'local[n]' with n > 1 - *and describe where Mqtt publisher is running. + * Usage: MQTTWordCount +\ *and describe where Mqtt publisher is running. * * To run this example locally, you may run publisher as - *`$ ./bin/run-example org.apache.spark.examples.streaming.MQTTPublisher tcp://localhost:1883 foo` + *`$ ./bin/spark-submit examples.jar \ + *--class org.apache.spark.examples.streaming.MQTTPublisher tcp://localhost:1883 foo` * and run the example as - *`$ ./bin/run-example org.apache.spark.examples.streaming.MQTTWordCount local[2] tcp://localhost:1883 foo` + *`$ ./bin/spark-submit examples.jar \ + *--class org.apache.spark.examples.streaming.MQTTWordCount tcp://localhost:1883 foo` */ -// scalastyle:on object MQTTWordCount { def main(args: Array[String]) { -if (args.length < 3) { +if (args.length < 2) { System.err.println( -"Usage: MQTTWordCount " + - " In local mode, should be 'local[n]' with n > 1") +"Usage: MQTTWordCount ") System.exit(1) } -val Seq(master, brokerUrl, topic) = args.toSeq - -val ssc = new StreamingContext(master, "MqttWordCount", Seconds(2), System.getenv("SPARK_HOME"), -StreamingContext.jarOfClass(this.getClass).toSeq) +val Seq(brokerUrl, topic) = args.toSeq +val sparkConf = new SparkConf().setAppName("MQTTWordCount") +val ssc = new StreamingContext(sparkConf, Seconds(2)) val lines = MQTTUtils.createStream(ssc, brokerUrl, topic, StorageLevel.MEMORY_ONLY_SER_2) val words = lines.flatMap(x => x.toString.split(" ")) http://git-wip-us.apache.org/repos/asf/spark/blob/44dd57fb/examples/src/main/scala/org/apache/spark/examples/streaming/NetworkWordCount.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/NetworkWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/NetworkWordCount.scala index acfe9a4..ad7a199 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/NetworkWordCount.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/NetworkWordCount.scala @@ -17,41 +17,38 @@ package org.apache.spark.examples.streaming +import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.StreamingContext._ 
import org.apache.spark.storage.StorageLevel -// scalastyle:off /** * Counts words in text encoded with UTF8 received from the network every second. * - * Usage: NetworkWordCount - *is the Spark master URL. In local mode, should be 'local[n]' with n > 1. - *and describe the TCP server that Spark Streaming would connect to receive data. + * Usage: NetworkWordCount + * and describe the TCP server that Spark Streaming would connect to receive data. * * To run this on your local machine, you need to first run a Netcat server *`$ nc -lk ` * and then run the example - *`$ ./bin/run-example org.apache.spark.examples.streaming.NetworkWordCount local[2] localhost ` + *`$ ./bin/spark-submit examples.jar \ + *--class org.apache.spark.examples.streaming.NetworkWordCount localhost ` */ -// scalastyle:on object NetworkWordCount { def main(args: Array[String]) { -if (args.length < 3) { - System.err.println("Usage: NetworkWordCount \n" + -"In local mode, should be 'local[n]' with n > 1") +if (args.length < 2) {
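The shape shared by the updated Scala examples, in brief (a condensed sketch of the NetworkWordCount pattern shown above):

    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.streaming.StreamingContext._

    val sparkConf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(1)) // the master URL comes from spark-submit, not argv
    val lines = ssc.socketTextStream("localhost", 9999)
    lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()
    ssc.start()
    ssc.awaitTermination()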
git commit: SPARK-1668: Add implicit preference as an option to examples/MovieLensALS
Repository: spark Updated Branches: refs/heads/branch-1.0 c7b27043a -> 35aa2448a SPARK-1668: Add implicit preference as an option to examples/MovieLensALS Add --implicitPrefs as an command-line option to the example app MovieLensALS under examples/ Author: Sandeep Closes #597 from techaddict/SPARK-1668 and squashes the following commits: 8b371dc [Sandeep] Second Pass on reviews by mengxr eca9d37 [Sandeep] based on mengxr's suggestions 937e54c [Sandeep] Changes 5149d40 [Sandeep] Changes based on review 1dd7657 [Sandeep] use mean() 42444d7 [Sandeep] Based on Suggestions by mengxr e3082fa [Sandeep] SPARK-1668: Add implicit preference as an option to examples/MovieLensALS Add --implicitPrefs as an command-line option to the example app MovieLensALS under examples/ (cherry picked from commit 108c4c16cc82af2e161d569d2c23849bdbf4aadb) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/35aa2448 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/35aa2448 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/35aa2448 Branch: refs/heads/branch-1.0 Commit: 35aa2448ab2f02e822aeef0fbfacf297f0ca39ec Parents: c7b2704 Author: Sandeep Authored: Thu May 8 00:15:05 2014 -0400 Committer: Reynold Xin Committed: Thu May 8 00:15:15 2014 -0400 -- .../spark/examples/mllib/MovieLensALS.scala | 55 1 file changed, 46 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/35aa2448/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala index 703f022..0e4447e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala @@ -43,7 +43,8 @@ object MovieLensALS { kryo: Boolean = false, numIterations: Int = 20, lambda: Double = 1.0, - rank: Int = 10) + rank: Int = 10, + implicitPrefs: Boolean = false) def main(args: Array[String]) { val defaultParams = Params() @@ -62,6 +63,9 @@ object MovieLensALS { opt[Unit]("kryo") .text(s"use Kryo serialization") .action((_, c) => c.copy(kryo = true)) + opt[Unit]("implicitPrefs") +.text("use implicit preference") +.action((_, c) => c.copy(implicitPrefs = true)) arg[String]("") .required() .text("input paths to a MovieLens dataset of ratings") @@ -88,7 +92,25 @@ object MovieLensALS { val ratings = sc.textFile(params.input).map { line => val fields = line.split("::") - Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble) + if (params.implicitPrefs) { +/* + * MovieLens ratings are on a scale of 1-5: + * 5: Must see + * 4: Will enjoy + * 3: It's okay + * 2: Fairly bad + * 1: Awful + * So we should not recommend a movie if the predicted rating is less than 3. + * To map ratings to confidence scores, we use + * 5 -> 2.5, 4 -> 1.5, 3 -> 0.5, 2 -> -0.5, 1 -> -1.5. This mappings means unobserved + * entries are generally between It's okay and Fairly bad. + * The semantics of 0 in this expanded world of non-positive weights + * are "the same as never having interacted at all". 
+ */ +Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble - 2.5) + } else { +Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble) + } }.cache() val numRatings = ratings.count() @@ -99,7 +121,18 @@ object MovieLensALS { val splits = ratings.randomSplit(Array(0.8, 0.2)) val training = splits(0).cache() -val test = splits(1).cache() +val test = if (params.implicitPrefs) { + /* + * 0 means "don't know" and positive values mean "confident that the prediction should be 1". + * Negative values means "confident that the prediction should be 0". + * We have in this case used some kind of weighted RMSE. The weight is the absolute value of + * the confidence. The error is the difference between prediction and either 1 or 0, + * depending on whether r is positive or negative. + */ + splits(1).map(x => Rating(x.user, x.product, if (x.rating > 0) 1.0 else 0.0)) +} else { + splits(1) +}.cache() val numTraining = training.count() val numTest = test.count() @@ -111,9 +144,10 @@ object MovieLensALS { .setRank(params.rank) .setIterations(params.numIterations) .setLambda
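To make the mapping concrete, the confidence transform described in the comment above, as a standalone function (hedged; the helper name is illustrative):

    import org.apache.spark.mllib.recommendation.Rating

    // rating - 2.5 turns the 1-5 star scale into roughly symmetric confidence scores:
    //   5 -> 2.5, 4 -> 1.5, 3 -> 0.5, 2 -> -0.5, 1 -> -1.5
    def toConfidence(r: Rating): Rating = Rating(r.user, r.product, r.rating - 2.5)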
git commit: SPARK-1770: Revert accidental(?) fix
Repository: spark Updated Branches: refs/heads/branch-1.0 80f292a21 -> 8202276c9 SPARK-1770: Revert accidental(?) fix Looks like this change was accidentally committed here: https://github.com/apache/spark/commit/06b15baab25951d124bbe6b64906f4139e037deb but the change does not show up in the PR itself (#704). Other than not intending to go in with that PR, this also broke the test JavaAPISuite.repartition. Author: Aaron Davidson Closes #716 from aarondav/shufflerand and squashes the following commits: b1cf70b [Aaron Davidson] SPARK-1770: Revert accidental(?) fix (cherry picked from commit 59577df14c06417676a9ffdd599f5713c448e299) Signed-off-by: Aaron Davidson Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8202276c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8202276c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8202276c Branch: refs/heads/branch-1.0 Commit: 8202276c916879eeb64e2b5591aa0faf5b0172bd Parents: 80f292a Author: Aaron Davidson Authored: Fri May 9 14:51:34 2014 -0700 Committer: Aaron Davidson Committed: Fri May 9 14:52:13 2014 -0700 -- core/src/main/scala/org/apache/spark/rdd/RDD.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8202276c/core/src/main/scala/org/apache/spark/rdd/RDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 9d8d804..a1ca612 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -330,9 +330,9 @@ abstract class RDD[T: ClassTag]( if (shuffle) { // include a shuffle step so that our upstream tasks are still distributed new CoalescedRDD( -new ShuffledRDD[Int, T, (Int, T)](map(x => (Utils.random.nextInt(), x)), +new ShuffledRDD[T, Null, (T, Null)](map(x => (x, null)), new HashPartitioner(numPartitions)), -numPartitions).values +numPartitions).keys } else { new CoalescedRDD(this, numPartitions) }
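The reverted hunk keys the shuffle by the element itself rather than by a random integer; a plain-Scala illustration (not Spark code, partition count arbitrary) of the practical difference:

```
val numPartitions = 2
def partitionOf(key: Any): Int = math.abs(key.hashCode % numPartitions)

val data = Seq("a", "b", "a")
data.map(x => x -> partitionOf(x))                            // "a" always maps to the same partition
data.map(x => x -> partitionOf(scala.util.Random.nextInt()))  // placement changes from run to run
```

Keying by the element keeps placement deterministic per value, which the JavaAPISuite.repartition test mentioned above apparently depends on.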
git commit: Converted bang to ask to avoid scary warning when a block is removed
Repository: spark Updated Branches: refs/heads/branch-1.0 1d56cd544 -> b8c17e392 Converted bang to ask to avoid scary warning when a block is removed Removing a block through the blockmanager gave a scary warning messages in the driver. ``` 2014-05-08 20:16:19,172 WARN BlockManagerMasterActor: Got unknown message: true 2014-05-08 20:16:19,172 WARN BlockManagerMasterActor: Got unknown message: true 2014-05-08 20:16:19,172 WARN BlockManagerMasterActor: Got unknown message: true ``` This is because the [BlockManagerSlaveActor](https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/storage/BlockManagerSlaveActor.scala#L44) would send back an acknowledgement ("true"). But the BlockManagerMasterActor would have sent the RemoveBlock message as a send, not as ask(), so would reject the receiver "true" as a unknown message. @pwendell Author: Tathagata Das Closes #708 from tdas/bm-fix and squashes the following commits: ed4ef15 [Tathagata Das] Converted bang to ask to avoid scary warning when a block is removed. (cherry picked from commit 32868f31f88aebd580ab9329dc51a30c26af7a74) Signed-off-by: Tathagata Das Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b8c17e39 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b8c17e39 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b8c17e39 Branch: refs/heads/branch-1.0 Commit: b8c17e3928d070d4757d44995516b8872196e5c9 Parents: 1d56cd5 Author: Tathagata Das Authored: Thu May 8 22:34:08 2014 -0700 Committer: Tathagata Das Committed: Thu May 8 22:34:21 2014 -0700 -- .../scala/org/apache/spark/storage/BlockManagerMasterActor.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b8c17e39/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala -- diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala index 98fa0df..6aed322 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala @@ -250,7 +250,7 @@ class BlockManagerMasterActor(val isLocal: Boolean, conf: SparkConf, listenerBus // Remove the block from the slave's BlockManager. // Doesn't actually wait for a confirmation and the message might get lost. // If message loss becomes frequent, we should add retry logic here. - blockManager.get.slaveActor ! RemoveBlock(blockId) + blockManager.get.slaveActor.ask(RemoveBlock(blockId))(akkaTimeout) } } }
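As a rough sketch of the difference (the actor and message types here are local stand-ins, not the actual Spark classes): `!` fires and forgets, so the slave's `true` acknowledgement comes back with nothing expecting it, whereas `ask` returns a Future that absorbs the reply.

```
import akka.actor.ActorRef
import akka.pattern.ask
import akka.util.Timeout

case class RemoveBlock(blockId: String)

def removeViaTell(slave: ActorRef): Unit =
  slave ! RemoveBlock("rdd_0_0")            // reply arrives unsolicited -> "Got unknown message: true"

def removeViaAsk(slave: ActorRef)(implicit timeout: Timeout) =
  slave.ask(RemoveBlock("rdd_0_0"))         // reply is delivered to the returned Future instead
```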
git commit: [SPARK-1644] The org.datanucleus:* should not be packaged into spark-assembly-*.jar
Repository: spark Updated Branches: refs/heads/branch-1.0 adf8cdd0b -> 2a878dab6 [SPARK-1644] The org.datanucleus:* should not be packaged into spark-assembly-*.jar Author: witgo Closes #688 from witgo/SPARK-1644 and squashes the following commits: 56ad6ac [witgo] review commit 87c03e4 [witgo] Merge branch 'master' of https://github.com/apache/spark into SPARK-1644 6ffa7e4 [witgo] review commit a597414 [witgo] The org.datanucleus:* should not be packaged into spark-assembly-*.jar (cherry picked from commit 561510867a1b79beef57acf9df65c9f88481435d) Signed-off-by: Patrick Wendell Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2a878dab Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2a878dab Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2a878dab Branch: refs/heads/branch-1.0 Commit: 2a878dab6a04e39c1ff50a322a61abbc9a08d5db Parents: adf8cdd Author: witgo Authored: Sat May 10 10:15:04 2014 -0700 Committer: Patrick Wendell Committed: Sat May 10 10:15:16 2014 -0700 -- assembly/pom.xml | 1 + project/SparkBuild.scala | 11 ++- 2 files changed, 7 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2a878dab/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 4a5df66..208794f 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -96,6 +96,7 @@ *:* +org.datanucleus:* META-INF/*.SF META-INF/*.DSA META-INF/*.RSA http://git-wip-us.apache.org/repos/asf/spark/blob/2a878dab/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 79c1972..d0b5409 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -579,12 +579,13 @@ object SparkBuild extends Build { def extraAssemblySettings() = Seq( test in assembly := {}, mergeStrategy in assembly := { - case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard - case m if m.toLowerCase.matches("meta-inf.*\\.sf$") => MergeStrategy.discard - case "log4j.properties" => MergeStrategy.discard + case PathList("org", "datanucleus", xs @ _*) => MergeStrategy.discard + case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard + case m if m.toLowerCase.matches("meta-inf.*\\.sf$") => MergeStrategy.discard + case "log4j.properties" => MergeStrategy.discard case m if m.toLowerCase.startsWith("meta-inf/services/") => MergeStrategy.filterDistinctLines - case "reference.conf" => MergeStrategy.concat - case _ => MergeStrategy.first + case "reference.conf"=> MergeStrategy.concat + case _ => MergeStrategy.first } )
git commit: [SPARK-1460] Returning SchemaRDD instead of normal RDD on Set operations...
Repository: spark Updated Branches: refs/heads/branch-1.0 756c96939 -> da9f9e05b [SPARK-1460] Returning SchemaRDD instead of normal RDD on Set operations... ... that do not change schema Author: Kan Zhang Closes #448 from kanzhang/SPARK-1460 and squashes the following commits: 111e388 [Kan Zhang] silence MiMa errors in EdgeRDD and VertexRDD 91dc787 [Kan Zhang] Taking into account newly added Ordering param 79ed52a [Kan Zhang] [SPARK-1460] Returning SchemaRDD on Set operations that do not change schema (cherry picked from commit 967635a2425a769b932eea0984fe697d6721cab0) Signed-off-by: Patrick Wendell Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/da9f9e05 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/da9f9e05 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/da9f9e05 Branch: refs/heads/branch-1.0 Commit: da9f9e05b47f5745c09377de15eccca131f07d51 Parents: 756c969 Author: Kan Zhang Authored: Wed May 7 09:41:31 2014 -0700 Committer: Patrick Wendell Committed: Wed May 7 09:41:43 2014 -0700 -- .../main/scala/org/apache/spark/rdd/RDD.scala | 10 +- .../scala/org/apache/spark/graphx/EdgeRDD.scala | 10 +- .../org/apache/spark/graphx/VertexRDD.scala | 10 +- project/MimaBuild.scala | 2 + python/pyspark/sql.py | 29 .../scala/org/apache/spark/sql/SchemaRDD.scala | 67 - .../spark/sql/api/java/JavaSchemaRDD.scala | 140 +++ 7 files changed, 246 insertions(+), 22 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/da9f9e05/core/src/main/scala/org/apache/spark/rdd/RDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 3b3524f..a1ca612 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -128,7 +128,7 @@ abstract class RDD[T: ClassTag]( @transient var name: String = null /** Assign a name to this RDD */ - def setName(_name: String): RDD[T] = { + def setName(_name: String): this.type = { name = _name this } @@ -138,7 +138,7 @@ abstract class RDD[T: ClassTag]( * it is computed. This can only be used to assign a new storage level if the RDD does not * have a storage level set yet.. */ - def persist(newLevel: StorageLevel): RDD[T] = { + def persist(newLevel: StorageLevel): this.type = { // TODO: Handle changes of StorageLevel if (storageLevel != StorageLevel.NONE && newLevel != storageLevel) { throw new UnsupportedOperationException( @@ -152,10 +152,10 @@ abstract class RDD[T: ClassTag]( } /** Persist this RDD with the default storage level (`MEMORY_ONLY`). */ - def persist(): RDD[T] = persist(StorageLevel.MEMORY_ONLY) + def persist(): this.type = persist(StorageLevel.MEMORY_ONLY) /** Persist this RDD with the default storage level (`MEMORY_ONLY`). */ - def cache(): RDD[T] = persist() + def cache(): this.type = persist() /** * Mark the RDD as non-persistent, and remove all blocks for it from memory and disk. @@ -163,7 +163,7 @@ abstract class RDD[T: ClassTag]( * @param blocking Whether to block until all blocks are deleted. * @return This RDD. 
*/ - def unpersist(blocking: Boolean = true): RDD[T] = { + def unpersist(blocking: Boolean = true): this.type = { logInfo("Removing RDD " + id + " from persistence list") sc.unpersistRDD(id, blocking) storageLevel = StorageLevel.NONE http://git-wip-us.apache.org/repos/asf/spark/blob/da9f9e05/graphx/src/main/scala/org/apache/spark/graphx/EdgeRDD.scala -- diff --git a/graphx/src/main/scala/org/apache/spark/graphx/EdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/EdgeRDD.scala index 6d04bf7..fa78ca9 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/EdgeRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/EdgeRDD.scala @@ -51,18 +51,12 @@ class EdgeRDD[@specialized ED: ClassTag]( override def collect(): Array[Edge[ED]] = this.map(_.copy()).collect() - override def persist(newLevel: StorageLevel): EdgeRDD[ED] = { + override def persist(newLevel: StorageLevel): this.type = { partitionsRDD.persist(newLevel) this } - /** Persist this RDD with the default storage level (`MEMORY_ONLY`). */ - override def persist(): EdgeRDD[ED] = persist(StorageLevel.MEMORY_ONLY) - - /** Persist this RDD with the default storage level (`MEMORY_ONLY`). */ - override def cache(): EdgeRDD[ED] = persist() - - override def unpersist(blocking: Boolean = true): EdgeRDD[ED] = { + override def unpersist(blocking: Bool
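The key trick in the hunk above is the `this.type` return: subclasses inherit `cache()`/`persist()`/`unpersist()` and still get their own type back, which is why the extra overrides in EdgeRDD can be removed. A self-contained sketch with made-up class names:

```
abstract class BaseLike {
  def cache(): this.type = { /* mark for caching */ this }
}

class SchemaLike extends BaseLike {
  def registerAsTable(name: String): Unit = ()
}

val s: SchemaLike = new SchemaLike().cache()   // still a SchemaLike, no override required
s.registerAsTable("people")
```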
git commit: Bug fix of sparse vector conversion
Repository: spark Updated Branches: refs/heads/master 910a13b3c -> 191279ce4 Bug fix of sparse vector conversion Fixed a small bug caused by the inconsistency of index/data array size and vector length. Author: Funes Author: funes Closes #661 from funes/bugfix and squashes the following commits: edb2b9d [funes] remove unused import 75dced3 [Funes] update test case d129a66 [Funes] Add test for sparse breeze by vector builder 64e7198 [Funes] Copy data only when necessary b85806c [Funes] Bug fix of sparse vector conversion Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/191279ce Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/191279ce Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/191279ce Branch: refs/heads/master Commit: 191279ce4edb940821d11a6b25cd33c8ad0af054 Parents: 910a13b Author: Funes Authored: Thu May 8 17:54:10 2014 -0700 Committer: Patrick Wendell Committed: Thu May 8 17:54:10 2014 -0700 -- .../main/scala/org/apache/spark/mllib/linalg/Vectors.scala | 6 +- .../spark/mllib/linalg/BreezeVectorConversionSuite.scala| 9 + 2 files changed, 14 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/191279ce/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 7cdf6bd..84d2239 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -136,7 +136,11 @@ object Vectors { new DenseVector(v.toArray) // Can't use underlying array directly, so make a new one } case v: BSV[Double] => -new SparseVector(v.length, v.index, v.data) +if (v.index.length == v.used) { + new SparseVector(v.length, v.index, v.data) +} else { + new SparseVector(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used)) +} case v: BV[_] => sys.error("Unsupported Breeze vector type: " + v.getClass.getName) } http://git-wip-us.apache.org/repos/asf/spark/blob/191279ce/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala index aacaa30..8abdac7 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeVectorConversionSuite.scala @@ -55,4 +55,13 @@ class BreezeVectorConversionSuite extends FunSuite { assert(vec.indices.eq(indices), "should not copy data") assert(vec.values.eq(values), "should not copy data") } + + test("sparse breeze with partially-used arrays to vector") { +val activeSize = 3 +val breeze = new BSV[Double](indices, values, activeSize, n) +val vec = Vectors.fromBreeze(breeze).asInstanceOf[SparseVector] +assert(vec.size === n) +assert(vec.indices === indices.slice(0, activeSize)) +assert(vec.values === values.slice(0, activeSize)) + } }
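The added test constructs exactly this over-allocated case; a standalone sketch of the situation (array contents arbitrary):

```
import breeze.linalg.{SparseVector => BSV}

val n = 10
val indices = Array(2, 7, 9)
val values  = Array(1.0, 3.5, 0.25)
val activeSize = 2                      // only the first two entries are actually in use

val bsv = new BSV[Double](indices, values, activeSize, n)
// Copying the raw arrays would drag along the stale trailing entry; slicing down to
// bsv.used keeps exactly the live data, which is what Vectors.fromBreeze now does.
val liveIndices = bsv.index.slice(0, bsv.used)
val liveValues  = bsv.data.slice(0, bsv.used)
```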
git commit: SPARK-1833 - Have an empty SparkContext constructor.
Repository: spark Updated Branches: refs/heads/branch-1.0 530bdf7d4 -> 8e13ab2fe SPARK-1833 - Have an empty SparkContext constructor. This is nicer than relying on new SparkContext(new SparkConf()) Author: Patrick Wendell Closes #774 from pwendell/spark-context and squashes the following commits: ef9f12f [Patrick Wendell] SPARK-1833 - Have an empty SparkContext constructor. (cherry picked from commit 65533c7ec03e7eedf5cd9756822863ab6f034ec9) Signed-off-by: Patrick Wendell Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8e13ab2f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8e13ab2f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8e13ab2f Branch: refs/heads/branch-1.0 Commit: 8e13ab2fe25d2fd50ee84a42f0f2d248432c7734 Parents: 530bdf7 Author: Patrick Wendell Authored: Wed May 14 12:53:30 2014 -0700 Committer: Patrick Wendell Committed: Wed May 14 12:53:42 2014 -0700 -- core/src/main/scala/org/apache/spark/SparkContext.scala | 6 ++ 1 file changed, 6 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8e13ab2f/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 032b3d7..634c10c 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -67,6 +67,12 @@ class SparkContext(config: SparkConf) extends Logging { private[spark] var preferredNodeLocationData: Map[String, Set[SplitInfo]] = Map() /** + * Create a SparkContext that loads settings from system properties (for instance, when + * launching with ./bin/spark-submit). + */ + def this() = this(new SparkConf()) + + /** * :: DeveloperApi :: * Alternative constructor for setting preferred locations where Spark will create executors. *
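With this in place, an application launched through spark-submit can construct its context without touching SparkConf at all:

```
import org.apache.spark.SparkContext

// Equivalent to new SparkContext(new SparkConf()); master, app name, etc. arrive
// as system properties set by the launcher.
val sc = new SparkContext()
```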
git commit: SPARK-1829 Sub-second durations shouldn't round to "0 s"
Repository: spark Updated Branches: refs/heads/branch-1.0 379f733e9 -> 530bdf7d4 SPARK-1829 Sub-second durations shouldn't round to "0 s" As "99 ms" up to 99 ms As "0.1 s" from 0.1 s up to 0.9 s https://issues.apache.org/jira/browse/SPARK-1829 Compare the first image to the second here: http://imgur.com/RaLEsSZ,7VTlgfo#0 Author: Andrew Ash Closes #768 from ash211/spark-1829 and squashes the following commits: 1c15b8e [Andrew Ash] SPARK-1829 Format sub-second durations more appropriately (cherry picked from commit a3315d7f4c7584dae2ee0aa33c6ec9e97b229b48) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/530bdf7d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/530bdf7d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/530bdf7d Branch: refs/heads/branch-1.0 Commit: 530bdf7d4bde2e90e1523e65b089559a2eddd793 Parents: 379f733 Author: Andrew Ash Authored: Wed May 14 12:01:14 2014 -0700 Committer: Reynold Xin Committed: Wed May 14 12:01:22 2014 -0700 -- core/src/main/scala/org/apache/spark/ui/UIUtils.scala | 6 ++ 1 file changed, 6 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/530bdf7d/core/src/main/scala/org/apache/spark/ui/UIUtils.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index a3d6a18..a43314f 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -36,7 +36,13 @@ private[spark] object UIUtils extends Logging { def formatDate(timestamp: Long): String = dateFormat.get.format(new Date(timestamp)) def formatDuration(milliseconds: Long): String = { +if (milliseconds < 100) { + return "%d ms".format(milliseconds) +} val seconds = milliseconds.toDouble / 1000 +if (seconds < 1) { + return "%.1f s".format(seconds) +} if (seconds < 60) { return "%.0f s".format(seconds) }
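Restated outside the class (UIUtils is private[spark], and the tail of the method beyond the shown hunk is assumed here, so treat this as a sketch rather than the real code):

```
def formatDuration(milliseconds: Long): String = {
  if (milliseconds < 100) return s"$milliseconds ms"   // keep millisecond resolution below 100 ms
  val seconds = milliseconds.toDouble / 1000
  if (seconds < 1) "%.1f s".format(seconds)            // 0.1 s up to 0.9 s
  else if (seconds < 60) "%.0f s".format(seconds)
  else "%.1f min".format(seconds / 60)                 // assumed continuation of the original method
}

formatDuration(45)      // "45 ms"
formatDuration(300)     // "0.3 s"
formatDuration(12000)   // "12 s"
```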
git commit: Fix: sbt test throws a java.lang.OutOfMemoryError: PermGen space
Repository: spark Updated Branches: refs/heads/master 17f3075bc -> fde82c154 Fix: sbt test throw an java.lang.OutOfMemoryError: PermGen space Author: witgo Closes #773 from witgo/sbt_javaOptions and squashes the following commits: 26c7d38 [witgo] Improve sbt configuration Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fde82c15 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fde82c15 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fde82c15 Branch: refs/heads/master Commit: fde82c1549c78f1eebbb21ec34e60befbbff65f5 Parents: 17f3075 Author: witgo Authored: Wed May 14 11:19:26 2014 -0700 Committer: Reynold Xin Committed: Wed May 14 11:19:26 2014 -0700 -- .rat-excludes| 5 + project/SparkBuild.scala | 1 + 2 files changed, 6 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fde82c15/.rat-excludes -- diff --git a/.rat-excludes b/.rat-excludes index 5076695..6894678 100644 --- a/.rat-excludes +++ b/.rat-excludes @@ -43,3 +43,8 @@ test.out/* .*iml service.properties db.lck +build/* +dist/* +.*out +.*ipr +.*iws http://git-wip-us.apache.org/repos/asf/spark/blob/fde82c15/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 8d56b40..6adec55 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -183,6 +183,7 @@ object SparkBuild extends Build { javaOptions in Test += "-Dspark.testing=1", javaOptions in Test += "-Dsun.io.serialization.extendedDebugInfo=true", javaOptions in Test ++= System.getProperties.filter(_._1 startsWith "spark").map { case (k,v) => s"-D$k=$v" }.toSeq, +javaOptions in Test ++= "-Xmx3g -XX:PermSize=128M -XX:MaxNewSize=256m -XX:MaxPermSize=1g".split(" ").toSeq, javaOptions += "-Xmx3g", // Show full stack trace and duration in test cases. testOptions in Test += Tests.Argument("-oDF"),
git commit: Fix: sbt test throws a java.lang.OutOfMemoryError: PermGen space
Repository: spark Updated Branches: refs/heads/branch-1.0 e480bcfbd -> 379f733e9 Fix: sbt test throw an java.lang.OutOfMemoryError: PermGen space Author: witgo Closes #773 from witgo/sbt_javaOptions and squashes the following commits: 26c7d38 [witgo] Improve sbt configuration (cherry picked from commit fde82c1549c78f1eebbb21ec34e60befbbff65f5) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/379f733e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/379f733e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/379f733e Branch: refs/heads/branch-1.0 Commit: 379f733e988daf2f1cae4cdac2faf1c42998b2b5 Parents: e480bcf Author: witgo Authored: Wed May 14 11:19:26 2014 -0700 Committer: Reynold Xin Committed: Wed May 14 11:19:43 2014 -0700 -- .rat-excludes| 5 + project/SparkBuild.scala | 1 + 2 files changed, 6 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/379f733e/.rat-excludes -- diff --git a/.rat-excludes b/.rat-excludes index 5076695..6894678 100644 --- a/.rat-excludes +++ b/.rat-excludes @@ -43,3 +43,8 @@ test.out/* .*iml service.properties db.lck +build/* +dist/* +.*out +.*ipr +.*iws http://git-wip-us.apache.org/repos/asf/spark/blob/379f733e/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 2d10d89..9cec1be 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -183,6 +183,7 @@ object SparkBuild extends Build { javaOptions in Test += "-Dspark.testing=1", javaOptions in Test += "-Dsun.io.serialization.extendedDebugInfo=true", javaOptions in Test ++= System.getProperties.filter(_._1 startsWith "spark").map { case (k,v) => s"-D$k=$v" }.toSeq, +javaOptions in Test ++= "-Xmx3g -XX:PermSize=128M -XX:MaxNewSize=256m -XX:MaxPermSize=1g".split(" ").toSeq, javaOptions += "-Xmx3g", // Show full stack trace and duration in test cases. testOptions in Test += Tests.Argument("-oDF"),
git commit: [maven-release-plugin] prepare release v1.0.0-rc6
Repository: spark Updated Branches: refs/heads/branch-1.0 e8ca3970c -> 54133abdc [maven-release-plugin] prepare release v1.0.0-rc6 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/54133abd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/54133abd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/54133abd Branch: refs/heads/branch-1.0 Commit: 54133abdce0246f6643a1112a5204afb2c4caa82 Parents: e8ca397 Author: Patrick Wendell Authored: Wed May 14 17:50:33 2014 + Committer: Patrick Wendell Committed: Wed May 14 17:50:33 2014 + -- assembly/pom.xml | 6 +++--- bagel/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/flume/pom.xml| 2 +- external/kafka/pom.xml| 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml | 2 +- external/zeromq/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml| 2 +- mllib/pom.xml | 2 +- pom.xml | 4 ++-- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/pom.xml | 2 +- yarn/stable/pom.xml | 2 +- 21 files changed, 24 insertions(+), 24 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/54133abd/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index abd8935..79b1b1f 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent -1.0.0-SNAPSHOT +1.0.0 ../pom.xml @@ -122,8 +122,8 @@ log4j.properties - - + + http://git-wip-us.apache.org/repos/asf/spark/blob/54133abd/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index 355f437..08932bb 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent -1.0.0-SNAPSHOT +1.0.0 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/54133abd/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index bab50f5..3e22641 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent -1.0.0-SNAPSHOT +1.0.0 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/54133abd/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index 874bcd7..006757a 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent -1.0.0-SNAPSHOT +1.0.0 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/54133abd/external/flume/pom.xml -- diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 6aec215..3ba984e 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent -1.0.0-SNAPSHOT +1.0.0 ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/54133abd/external/kafka/pom.xml -- diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml index 979eb0c..cb4dd47 100644 --- a/external/kafka/pom.xml +++ b/external/kafka/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent -1.0.0-SNAPSHOT +1.0.0 ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/54133abd/external/mqtt/pom.xml -- diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml index 7b2dc5b..b10916d 100644 --- a/external/mqtt/pom.xml +++ b/external/mqtt/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent -1.0.0-SNAPSHOT +1.0.0 ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/54133abd/external/twitter/pom.xml -- diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml index 5766d3a..f33b583 100644 --- a/external/twitter/pom.xml +++ b/external/twitter/
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.0.0-rc6 [created] aab03f5f9
git commit: [maven-release-plugin] prepare for next development iteration
Repository: spark Updated Branches: refs/heads/branch-1.0 54133abdc -> e480bcfbd [maven-release-plugin] prepare for next development iteration Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e480bcfb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e480bcfb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e480bcfb Branch: refs/heads/branch-1.0 Commit: e480bcfbd269ae1d7a6a92cfb50466cf192fe1fb Parents: 54133ab Author: Patrick Wendell Authored: Wed May 14 17:50:40 2014 + Committer: Patrick Wendell Committed: Wed May 14 17:50:40 2014 + -- assembly/pom.xml | 2 +- bagel/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/flume/pom.xml| 2 +- external/kafka/pom.xml| 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml | 2 +- external/zeromq/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml| 2 +- mllib/pom.xml | 2 +- pom.xml | 4 ++-- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/pom.xml | 2 +- yarn/stable/pom.xml | 2 +- 21 files changed, 22 insertions(+), 22 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e480bcfb/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 79b1b1f..f79766d 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent -1.0.0 +1.0.1-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/e480bcfb/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index 08932bb..85f6d99 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent -1.0.0 +1.0.1-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/e480bcfb/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index 3e22641..47c2507 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent -1.0.0 +1.0.1-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/e480bcfb/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index 006757a..b7cbb1a 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent -1.0.0 +1.0.1-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/e480bcfb/external/flume/pom.xml -- diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 3ba984e..b8fc07f 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent -1.0.0 +1.0.1-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/e480bcfb/external/kafka/pom.xml -- diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml index cb4dd47..9eeb2e1 100644 --- a/external/kafka/pom.xml +++ b/external/kafka/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent -1.0.0 +1.0.1-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/e480bcfb/external/mqtt/pom.xml -- diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml index b10916d..f4272ce 100644 --- a/external/mqtt/pom.xml +++ b/external/mqtt/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent -1.0.0 +1.0.1-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/e480bcfb/external/twitter/pom.xml -- diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml index f33b583..e4c2302 100644 --- a/external/twitter/pom.xml +++ b/external/twitter/pom.xml @@ -21,7 +21,7 @@ org.apache.spark 
spark-parent -1.0.0 +1.0.1-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/e480bcfb/external/zer
git commit: [SPARK-1620] Handle uncaught exceptions in function run by Akka scheduler
Repository: spark Updated Branches: refs/heads/branch-1.0 34f6fa921 -> 9ff9078fc [SPARK-1620] Handle uncaught exceptions in function run by Akka scheduler If the intended behavior was that uncaught exceptions thrown in functions being run by the Akka scheduler would end up being handled by the default uncaught exception handler set in Executor, and if that behavior is, in fact, correct, then this is a way to accomplish that. I'm not certain, though, that we shouldn't be doing something different to handle uncaught exceptions from some of these scheduled functions. In any event, this PR covers all of the cases I comment on in [SPARK-1620](https://issues.apache.org/jira/browse/SPARK-1620). Author: Mark Hamstra Closes #622 from markhamstra/SPARK-1620 and squashes the following commits: 071d193 [Mark Hamstra] refactored post-SPARK-1772 1a6a35e [Mark Hamstra] another style fix d30eb94 [Mark Hamstra] scalastyle 3573ecd [Mark Hamstra] Use wrapped try/catch in Utils.tryOrExit 8fc0439 [Mark Hamstra] Make functions run by the Akka scheduler use Executor's UncaughtExceptionHandler (cherry picked from commit 17f3075bc4aa8cbed165f7b367f70e84b1bc8db9) Signed-off-by: Patrick Wendell Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9ff9078f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9ff9078f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9ff9078f Branch: refs/heads/branch-1.0 Commit: 9ff9078fc840c05c75f635b7a6acc5080b8e1185 Parents: 34f6fa9 Author: Mark Hamstra Authored: Wed May 14 10:07:25 2014 -0700 Committer: Patrick Wendell Committed: Wed May 14 10:07:39 2014 -0700 -- .../apache/spark/deploy/client/AppClient.scala| 18 ++ .../org/apache/spark/deploy/worker/Worker.scala | 18 ++ .../spark/scheduler/TaskSchedulerImpl.scala | 3 ++- .../org/apache/spark/storage/BlockManager.scala | 2 +- .../main/scala/org/apache/spark/util/Utils.scala | 13 + 5 files changed, 36 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9ff9078f/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala b/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala index 896913d..d38e9e7 100644 --- a/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala +++ b/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala @@ -30,7 +30,7 @@ import org.apache.spark.{Logging, SparkConf, SparkException} import org.apache.spark.deploy.{ApplicationDescription, ExecutorState} import org.apache.spark.deploy.DeployMessages._ import org.apache.spark.deploy.master.Master -import org.apache.spark.util.AkkaUtils +import org.apache.spark.util.{Utils, AkkaUtils} /** * Interface allowing applications to speak with a Spark deploy cluster. Takes a master URL, @@ -88,13 +88,15 @@ private[spark] class AppClient( var retries = 0 registrationRetryTimer = Some { context.system.scheduler.schedule(REGISTRATION_TIMEOUT, REGISTRATION_TIMEOUT) { - retries += 1 - if (registered) { -registrationRetryTimer.foreach(_.cancel()) - } else if (retries >= REGISTRATION_RETRIES) { -markDead("All masters are unresponsive! Giving up.") - } else { -tryRegisterAllMasters() + Utils.tryOrExit { +retries += 1 +if (registered) { + registrationRetryTimer.foreach(_.cancel()) +} else if (retries >= REGISTRATION_RETRIES) { + markDead("All masters are unresponsive! 
Giving up.") +} else { + tryRegisterAllMasters() +} } } } http://git-wip-us.apache.org/repos/asf/spark/blob/9ff9078f/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index 85d25dc..134624c 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -166,14 +166,16 @@ private[spark] class Worker( var retries = 0 registrationRetryTimer = Some { context.system.scheduler.schedule(REGISTRATION_TIMEOUT, REGISTRATION_TIMEOUT) { -retries += 1 -if (registered) { - registrationRetryTimer.foreach(_.cancel()) -} else if (retries >= REGISTRATION_RETRIES) { - logError("All masters are unresponsive! Giving up.") -
git commit: [SPARK-1620] Handle uncaught exceptions in function run by Akka scheduler
Repository: spark Updated Branches: refs/heads/master d58cb33ff -> 17f3075bc [SPARK-1620] Handle uncaught exceptions in function run by Akka scheduler If the intended behavior was that uncaught exceptions thrown in functions being run by the Akka scheduler would end up being handled by the default uncaught exception handler set in Executor, and if that behavior is, in fact, correct, then this is a way to accomplish that. I'm not certain, though, that we shouldn't be doing something different to handle uncaught exceptions from some of these scheduled functions. In any event, this PR covers all of the cases I comment on in [SPARK-1620](https://issues.apache.org/jira/browse/SPARK-1620). Author: Mark Hamstra Closes #622 from markhamstra/SPARK-1620 and squashes the following commits: 071d193 [Mark Hamstra] refactored post-SPARK-1772 1a6a35e [Mark Hamstra] another style fix d30eb94 [Mark Hamstra] scalastyle 3573ecd [Mark Hamstra] Use wrapped try/catch in Utils.tryOrExit 8fc0439 [Mark Hamstra] Make functions run by the Akka scheduler use Executor's UncaughtExceptionHandler Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/17f3075b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/17f3075b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/17f3075b Branch: refs/heads/master Commit: 17f3075bc4aa8cbed165f7b367f70e84b1bc8db9 Parents: d58cb33 Author: Mark Hamstra Authored: Wed May 14 10:07:25 2014 -0700 Committer: Patrick Wendell Committed: Wed May 14 10:07:25 2014 -0700 -- .../apache/spark/deploy/client/AppClient.scala| 18 ++ .../org/apache/spark/deploy/worker/Worker.scala | 18 ++ .../spark/scheduler/TaskSchedulerImpl.scala | 3 ++- .../org/apache/spark/storage/BlockManager.scala | 2 +- .../main/scala/org/apache/spark/util/Utils.scala | 13 + 5 files changed, 36 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/17f3075b/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala b/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala index 896913d..d38e9e7 100644 --- a/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala +++ b/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala @@ -30,7 +30,7 @@ import org.apache.spark.{Logging, SparkConf, SparkException} import org.apache.spark.deploy.{ApplicationDescription, ExecutorState} import org.apache.spark.deploy.DeployMessages._ import org.apache.spark.deploy.master.Master -import org.apache.spark.util.AkkaUtils +import org.apache.spark.util.{Utils, AkkaUtils} /** * Interface allowing applications to speak with a Spark deploy cluster. Takes a master URL, @@ -88,13 +88,15 @@ private[spark] class AppClient( var retries = 0 registrationRetryTimer = Some { context.system.scheduler.schedule(REGISTRATION_TIMEOUT, REGISTRATION_TIMEOUT) { - retries += 1 - if (registered) { -registrationRetryTimer.foreach(_.cancel()) - } else if (retries >= REGISTRATION_RETRIES) { -markDead("All masters are unresponsive! Giving up.") - } else { -tryRegisterAllMasters() + Utils.tryOrExit { +retries += 1 +if (registered) { + registrationRetryTimer.foreach(_.cancel()) +} else if (retries >= REGISTRATION_RETRIES) { + markDead("All masters are unresponsive! 
Giving up.") +} else { + tryRegisterAllMasters() +} } } } http://git-wip-us.apache.org/repos/asf/spark/blob/17f3075b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index 85d25dc..134624c 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -166,14 +166,16 @@ private[spark] class Worker( var retries = 0 registrationRetryTimer = Some { context.system.scheduler.schedule(REGISTRATION_TIMEOUT, REGISTRATION_TIMEOUT) { -retries += 1 -if (registered) { - registrationRetryTimer.foreach(_.cancel()) -} else if (retries >= REGISTRATION_RETRIES) { - logError("All masters are unresponsive! Giving up.") - System.exit(1) -} else { - tryRegisterAllMasters() +Utils.tryOrExit { +
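The essence of Utils.tryOrExit is a wrapped try/catch that hands uncaught throwables to process-level uncaught-exception handling instead of letting Akka's scheduler thread swallow them. The sketch below captures the idea; the actual Spark implementation differs in detail (per the commit message, it delegates to the Executor's UncaughtExceptionHandler):

```
def tryOrExit(block: => Unit): Unit = {
  try {
    block
  } catch {
    case t: Throwable =>
      // Forward to the JVM-wide handler if one is installed, otherwise rethrow.
      Option(Thread.getDefaultUncaughtExceptionHandler) match {
        case Some(handler) => handler.uncaughtException(Thread.currentThread(), t)
        case None          => throw t
      }
  }
}

// Usage mirrors the diff: tryOrExit { retries += 1; ... }
```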
git commit: SPARK-1828: Created forked version of hive-exec that doesn't bundle other dependencies
Repository: spark Updated Branches: refs/heads/branch-1.0 fc6b65227 -> 34f6fa921 SPARK-1828: Created forked version of hive-exec that doesn't bundle other dependencies See https://issues.apache.org/jira/browse/SPARK-1828 for more information. This is being submitted to Jenkin's for testing. The dependency won't fully propagate in Maven central for a few more hours. Author: Patrick Wendell Closes #767 from pwendell/hive-shaded and squashes the following commits: ea10ac5 [Patrick Wendell] SPARK-1828: Created forked version of hive-exec that doesn't bundle other dependencies (cherry picked from commit d58cb33ffa9e98a64cecea7b40ce7bfbed145079) Signed-off-by: Patrick Wendell Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/34f6fa92 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/34f6fa92 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/34f6fa92 Branch: refs/heads/branch-1.0 Commit: 34f6fa92155ba70d7b17315664618a007f9325ab Parents: fc6b652 Author: Patrick Wendell Authored: Wed May 14 09:51:01 2014 -0700 Committer: Patrick Wendell Committed: Wed May 14 09:51:11 2014 -0700 -- project/SparkBuild.scala | 6 +++--- sql/hive/pom.xml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/34f6fa92/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index cca3fba..2d10d89 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -489,9 +489,9 @@ object SparkBuild extends Build { name := "spark-hive", javaOptions += "-XX:MaxPermSize=1g", libraryDependencies ++= Seq( - "org.apache.hive" % "hive-metastore" % hiveVersion, - "org.apache.hive" % "hive-exec" % hiveVersion, - "org.apache.hive" % "hive-serde" % hiveVersion + "org.spark-project.hive" % "hive-metastore" % hiveVersion, + "org.spark-project.hive" % "hive-exec" % hiveVersion, + "org.spark-project.hive" % "hive-serde" % hiveVersion ), // Multiple queries rely on the TestHive singleton. See comments there for more details. parallelExecution in Test := false, http://git-wip-us.apache.org/repos/asf/spark/blob/34f6fa92/sql/hive/pom.xml -- diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 4fd3cb0..0c55657 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -43,12 +43,12 @@ ${project.version} - org.apache.hive + org.spark-project.hive hive-metastore ${hive.version} - org.apache.hive + org.spark-project.hive hive-exec ${hive.version} @@ -63,7 +63,7 @@ jackson-mapper-asl - org.apache.hive + org.spark-project.hive hive-serde ${hive.version}
git commit: SPARK-1828: Created forked version of hive-exec that doesn't bundle other dependencies
Repository: spark Updated Branches: refs/heads/master d1d41ccee -> d58cb33ff SPARK-1828: Created forked version of hive-exec that doesn't bundle other dependencies See https://issues.apache.org/jira/browse/SPARK-1828 for more information. This is being submitted to Jenkin's for testing. The dependency won't fully propagate in Maven central for a few more hours. Author: Patrick Wendell Closes #767 from pwendell/hive-shaded and squashes the following commits: ea10ac5 [Patrick Wendell] SPARK-1828: Created forked version of hive-exec that doesn't bundle other dependencies Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d58cb33f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d58cb33f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d58cb33f Branch: refs/heads/master Commit: d58cb33ffa9e98a64cecea7b40ce7bfbed145079 Parents: d1d41cc Author: Patrick Wendell Authored: Wed May 14 09:51:01 2014 -0700 Committer: Patrick Wendell Committed: Wed May 14 09:51:01 2014 -0700 -- project/SparkBuild.scala | 6 +++--- sql/hive/pom.xml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d58cb33f/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 57b3e22..8d56b40 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -489,9 +489,9 @@ object SparkBuild extends Build { name := "spark-hive", javaOptions += "-XX:MaxPermSize=1g", libraryDependencies ++= Seq( - "org.apache.hive" % "hive-metastore" % hiveVersion, - "org.apache.hive" % "hive-exec" % hiveVersion, - "org.apache.hive" % "hive-serde" % hiveVersion + "org.spark-project.hive" % "hive-metastore" % hiveVersion, + "org.spark-project.hive" % "hive-exec" % hiveVersion, + "org.spark-project.hive" % "hive-serde" % hiveVersion ), // Multiple queries rely on the TestHive singleton. See comments there for more details. parallelExecution in Test := false, http://git-wip-us.apache.org/repos/asf/spark/blob/d58cb33f/sql/hive/pom.xml -- diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 8b32451..9254b70 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -44,12 +44,12 @@ ${project.version} - org.apache.hive + org.spark-project.hive hive-metastore ${hive.version} - org.apache.hive + org.spark-project.hive hive-exec ${hive.version} @@ -64,7 +64,7 @@ jackson-mapper-asl - org.apache.hive + org.spark-project.hive hive-serde ${hive.version}
git commit: SPARK-1818 Freshen Mesos documentation
Repository: spark Updated Branches: refs/heads/branch-1.0 7083282ea -> fc6b65227 SPARK-1818 Freshen Mesos documentation Place more emphasis on using precompiled binary versions of Spark and Mesos instead of encouraging the reader to compile from source. Author: Andrew Ash Closes #756 from ash211/spark-1818 and squashes the following commits: 7ef3b33 [Andrew Ash] Brief explanation of the interactions between Spark and Mesos e7dea8e [Andrew Ash] Add troubleshooting and debugging section 956362d [Andrew Ash] Don't need to pass spark.executor.uri into the spark shell de3353b [Andrew Ash] Wrap to 100char 7ebf6ef [Andrew Ash] Polish on the section on Mesos Master URLs 3dcc2c1 [Andrew Ash] Use --tgz parameter of make-distribution 41b68ed [Andrew Ash] Period at end of sentence; formatting on :5050 8bf2c53 [Andrew Ash] Update site.MESOS_VERSIOn to match /pom.xml 74f2040 [Andrew Ash] SPARK-1818 Freshen Mesos documentation (cherry picked from commit d1d41ccee49a5c093cb61c791c01f64f2076b83e) Signed-off-by: Patrick Wendell Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fc6b6522 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fc6b6522 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fc6b6522 Branch: refs/heads/branch-1.0 Commit: fc6b652275249051ba017f7c043f162a18bf4ae4 Parents: 7083282 Author: Andrew Ash Authored: Wed May 14 09:45:33 2014 -0700 Committer: Patrick Wendell Committed: Wed May 14 09:46:34 2014 -0700 -- docs/_config.yml | 2 +- docs/running-on-mesos.md | 200 -- 2 files changed, 174 insertions(+), 28 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fc6b6522/docs/_config.yml -- diff --git a/docs/_config.yml b/docs/_config.yml index ed27949..9cb27ff 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -7,6 +7,6 @@ SPARK_VERSION: 1.0.0 SPARK_VERSION_SHORT: 1.0.0 SCALA_BINARY_VERSION: "2.10" SCALA_VERSION: "2.10.4" -MESOS_VERSION: 0.13.0 +MESOS_VERSION: 0.18.1 SPARK_ISSUE_TRACKER_URL: https://issues.apache.org/jira/browse/SPARK SPARK_GITHUB_URL: https://github.com/apache/spark http://git-wip-us.apache.org/repos/asf/spark/blob/fc6b6522/docs/running-on-mesos.md -- diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index 68259f0..ef762aa 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -3,19 +3,123 @@ layout: global title: Running Spark on Mesos --- -Spark can run on clusters managed by [Apache Mesos](http://mesos.apache.org/). Follow the steps below to install Mesos and Spark: - -1. Download and build Spark using the instructions [here](index.html). **Note:** Don't forget to consider what version of HDFS you might want to use! -2. Download, build, install, and start Mesos {{site.MESOS_VERSION}} on your cluster. You can download the Mesos distribution from a [mirror](http://www.apache.org/dyn/closer.cgi/mesos/{{site.MESOS_VERSION}}/). See the Mesos [Getting Started](http://mesos.apache.org/gettingstarted) page for more information. **Note:** If you want to run Mesos without installing it into the default paths on your system (e.g., if you don't have administrative privileges to install it), you should also pass the `--prefix` option to `configure` to tell it where to install. For example, pass `--prefix=/home/user/mesos`. By default the prefix is `/usr/local`. -3. Create a Spark "distribution" using `make-distribution.sh`. -4. Rename the `dist` directory created from `make-distribution.sh` to `spark-{{site.SPARK_VERSION}}`. -5. 
Create a `tar` archive: `tar czf spark-{{site.SPARK_VERSION}}.tar.gz spark-{{site.SPARK_VERSION}}` -6. Upload this archive to HDFS or another place accessible from Mesos via `http://`, e.g., [Amazon Simple Storage Service](http://aws.amazon.com/s3): `hadoop fs -put spark-{{site.SPARK_VERSION}}.tar.gz /path/to/spark-{{site.SPARK_VERSION}}.tar.gz` -7. Create a file called `spark-env.sh` in Spark's `conf` directory, by copying `conf/spark-env.sh.template`, and add the following lines to it: - * `export MESOS_NATIVE_LIBRARY=`. This path is usually `/lib/libmesos.so` (where the prefix is `/usr/local` by default, see above). Also, on Mac OS X, the library is called `libmesos.dylib` instead of `libmesos.so`. - * `export SPARK_EXECUTOR_URI=`. - * `export MASTER=mesos://HOST:PORT` where HOST:PORT is the host and port (default: 5050) of your Mesos master (or `zk://...` if using Mesos with ZooKeeper). -8. To run a Spark application against the cluster, when you create your `SparkContext`, pass the string `mesos://HOST:PORT` as the master URL. In addition, you'll need to set the `spark.e
git commit: SPARK-1818 Freshen Mesos documentation
Repository: spark Updated Branches: refs/heads/master 2e5a7cde2 -> d1d41ccee SPARK-1818 Freshen Mesos documentation Place more emphasis on using precompiled binary versions of Spark and Mesos instead of encouraging the reader to compile from source. Author: Andrew Ash Closes #756 from ash211/spark-1818 and squashes the following commits: 7ef3b33 [Andrew Ash] Brief explanation of the interactions between Spark and Mesos e7dea8e [Andrew Ash] Add troubleshooting and debugging section 956362d [Andrew Ash] Don't need to pass spark.executor.uri into the spark shell de3353b [Andrew Ash] Wrap to 100char 7ebf6ef [Andrew Ash] Polish on the section on Mesos Master URLs 3dcc2c1 [Andrew Ash] Use --tgz parameter of make-distribution 41b68ed [Andrew Ash] Period at end of sentence; formatting on :5050 8bf2c53 [Andrew Ash] Update site.MESOS_VERSIOn to match /pom.xml 74f2040 [Andrew Ash] SPARK-1818 Freshen Mesos documentation Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d1d41cce Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d1d41cce Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d1d41cce Branch: refs/heads/master Commit: d1d41ccee49a5c093cb61c791c01f64f2076b83e Parents: 2e5a7cd Author: Andrew Ash Authored: Wed May 14 09:45:33 2014 -0700 Committer: Patrick Wendell Committed: Wed May 14 09:46:20 2014 -0700 -- docs/_config.yml | 2 +- docs/running-on-mesos.md | 200 -- 2 files changed, 174 insertions(+), 28 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d1d41cce/docs/_config.yml -- diff --git a/docs/_config.yml b/docs/_config.yml index d177e38..45b78fe 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -7,6 +7,6 @@ SPARK_VERSION: 1.0.0-SNAPSHOT SPARK_VERSION_SHORT: 1.0.0 SCALA_BINARY_VERSION: "2.10" SCALA_VERSION: "2.10.4" -MESOS_VERSION: 0.13.0 +MESOS_VERSION: 0.18.1 SPARK_ISSUE_TRACKER_URL: https://issues.apache.org/jira/browse/SPARK SPARK_GITHUB_URL: https://github.com/apache/spark http://git-wip-us.apache.org/repos/asf/spark/blob/d1d41cce/docs/running-on-mesos.md -- diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index 68259f0..ef762aa 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -3,19 +3,123 @@ layout: global title: Running Spark on Mesos --- -Spark can run on clusters managed by [Apache Mesos](http://mesos.apache.org/). Follow the steps below to install Mesos and Spark: - -1. Download and build Spark using the instructions [here](index.html). **Note:** Don't forget to consider what version of HDFS you might want to use! -2. Download, build, install, and start Mesos {{site.MESOS_VERSION}} on your cluster. You can download the Mesos distribution from a [mirror](http://www.apache.org/dyn/closer.cgi/mesos/{{site.MESOS_VERSION}}/). See the Mesos [Getting Started](http://mesos.apache.org/gettingstarted) page for more information. **Note:** If you want to run Mesos without installing it into the default paths on your system (e.g., if you don't have administrative privileges to install it), you should also pass the `--prefix` option to `configure` to tell it where to install. For example, pass `--prefix=/home/user/mesos`. By default the prefix is `/usr/local`. -3. Create a Spark "distribution" using `make-distribution.sh`. -4. Rename the `dist` directory created from `make-distribution.sh` to `spark-{{site.SPARK_VERSION}}`. -5. Create a `tar` archive: `tar czf spark-{{site.SPARK_VERSION}}.tar.gz spark-{{site.SPARK_VERSION}}` -6. 
Upload this archive to HDFS or another place accessible from Mesos via `http://`, e.g., [Amazon Simple Storage Service](http://aws.amazon.com/s3): `hadoop fs -put spark-{{site.SPARK_VERSION}}.tar.gz /path/to/spark-{{site.SPARK_VERSION}}.tar.gz` -7. Create a file called `spark-env.sh` in Spark's `conf` directory, by copying `conf/spark-env.sh.template`, and add the following lines to it: - * `export MESOS_NATIVE_LIBRARY=`. This path is usually `/lib/libmesos.so` (where the prefix is `/usr/local` by default, see above). Also, on Mac OS X, the library is called `libmesos.dylib` instead of `libmesos.so`. - * `export SPARK_EXECUTOR_URI=`. - * `export MASTER=mesos://HOST:PORT` where HOST:PORT is the host and port (default: 5050) of your Mesos master (or `zk://...` if using Mesos with ZooKeeper). -8. To run a Spark application against the cluster, when you create your `SparkContext`, pass the string `mesos://HOST:PORT` as the master URL. In addition, you'll need to set the `spark.executor.uri` property. For example: +# Why Mesos + +Spark can run on hardware clusters managed by [Ap
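In application code the refreshed instructions boil down to something like this (host, path, and version strings are placeholders):

```
import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf()
  .setMaster("mesos://host:5050")                                   // or zk://... when Mesos uses ZooKeeper
  .setAppName("MesosExample")
  .set("spark.executor.uri", "hdfs:///path/to/spark-1.0.0.tar.gz")  // where executors fetch the Spark tarball

val sc = new SparkContext(conf)
```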
git commit: SPARK-1827. LICENSE and NOTICE files need a refresh to contain transitive dependency info
Repository: spark Updated Branches: refs/heads/branch-1.0 c7571d8c6 -> 7083282ea SPARK-1827. LICENSE and NOTICE files need a refresh to contain transitive dependency info LICENSE and NOTICE policy is explained here: http://www.apache.org/dev/licensing-howto.html http://www.apache.org/legal/3party.html This leads to the following changes. First, this change enables two extensions to maven-shade-plugin in assembly/ that will try to include and merge all NOTICE and LICENSE files. This can't hurt. This generates a consolidated NOTICE file that I manually added to NOTICE. Next, a list of all dependencies and their licenses was generated: `mvn ... license:aggregate-add-third-party` to create: `target/generated-sources/license/THIRD-PARTY.txt` Each dependency is listed with one or more licenses. Determine the most-compatible license for each if there is more than one. For "unknown" license dependencies, I manually evaluateD their license. Many are actually Apache projects or components of projects covered already. The only non-trivial one was Colt, which has its own (compatible) license. I ignored Apache-licensed and public domain dependencies as these require no further action (beyond NOTICE above). BSD and MIT licenses (permissive Category A licenses) are evidently supposed to be mentioned in LICENSE, so I added a section without output from the THIRD-PARTY.txt file appropriately. Everything else, Category B licenses, are evidently mentioned in NOTICE (?) Same there. LICENSE contained some license statements for source code that is redistributed. I left this as I think that is the right place to put it. Author: Sean Owen Closes #770 from srowen/SPARK-1827 and squashes the following commits: a764504 [Sean Owen] Add LICENSE and NOTICE info for all transitive dependencies as of 1.0 (cherry picked from commit 2e5a7cde223c8bf6d34e46b27ac94a965441584d) Signed-off-by: Patrick Wendell Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7083282e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7083282e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7083282e Branch: refs/heads/branch-1.0 Commit: 7083282eaea9a1256b1047c0be9c07dbaba175ce Parents: c7571d8 Author: Sean Owen Authored: Wed May 14 09:38:33 2014 -0700 Committer: Patrick Wendell Committed: Wed May 14 09:38:46 2014 -0700 -- LICENSE | 103 + NOTICE | 572 +- assembly/pom.xml | 2 + 3 files changed, 671 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7083282e/LICENSE -- diff --git a/LICENSE b/LICENSE index 1c1c2c0..383f079 100644 --- a/LICENSE +++ b/LICENSE @@ -428,3 +428,106 @@ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON A THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +For colt: + + +Copyright (c) 1999 CERN - European Organization for Nuclear Research. +Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose is hereby granted without fee, provided that the above copyright notice appear in all copies and that both that copyright notice and this permission notice appear in supporting documentation. CERN makes no representations about the suitability of this software for any purpose. It is provided "as is" without expressed or implied warranty. 
+ +Packages hep.aida.* + +Written by Pavel Binko, Dino Ferrero Merlino, Wolfgang Hoschek, Tony Johnson, Andreas Pfeiffer, and others. Check the FreeHEP home page for more info. Permission to use and/or redistribute this work is granted under the terms of the LGPL License, with the exception that any usage related to military applications is expressly forbidden. The software and documentation made available under the terms of this license are provided with no warranty. + + + +Fo SnapTree: + + +SNAPTREE LICENSE + +Copyright (c) 2009-2012 Stanford University, unless otherwise specified. +All rights reserved. + +This software was developed by the Pervasive Parallelism Laboratory of +Stanford University, California, USA. + +Permission to use, copy, modify, and distribute this software in source +or binary form for any purpose with or without fee is hereby gra
git commit: SPARK-1827. LICENSE and NOTICE files need a refresh to contain transitive dependency info
Repository: spark Updated Branches: refs/heads/master 68f28dabe -> 2e5a7cde2 SPARK-1827. LICENSE and NOTICE files need a refresh to contain transitive dependency info LICENSE and NOTICE policy is explained here: http://www.apache.org/dev/licensing-howto.html http://www.apache.org/legal/3party.html This leads to the following changes. First, this change enables two extensions to maven-shade-plugin in assembly/ that will try to include and merge all NOTICE and LICENSE files. This can't hurt. This generates a consolidated NOTICE file that I manually added to NOTICE. Next, a list of all dependencies and their licenses was generated: `mvn ... license:aggregate-add-third-party` to create: `target/generated-sources/license/THIRD-PARTY.txt` Each dependency is listed with one or more licenses. I determined the most-compatible license for each where there was more than one. For "unknown" license dependencies, I manually evaluated their licenses. Many are actually Apache projects or components of projects covered already. The only non-trivial one was Colt, which has its own (compatible) license. I ignored Apache-licensed and public domain dependencies as these require no further action (beyond NOTICE above). BSD and MIT licenses (permissive Category A licenses) are evidently supposed to be mentioned in LICENSE, so I added a section with output from the THIRD-PARTY.txt file appropriately. Everything else (Category B licenses) is evidently supposed to be mentioned in NOTICE, so I did the same there. LICENSE contained some license statements for source code that is redistributed. I left this as I think that is the right place to put it. Author: Sean Owen Closes #770 from srowen/SPARK-1827 and squashes the following commits: a764504 [Sean Owen] Add LICENSE and NOTICE info for all transitive dependencies as of 1.0 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2e5a7cde Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2e5a7cde Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2e5a7cde Branch: refs/heads/master Commit: 2e5a7cde223c8bf6d34e46b27ac94a965441584d Parents: 68f28da Author: Sean Owen Authored: Wed May 14 09:38:33 2014 -0700 Committer: Patrick Wendell Committed: Wed May 14 09:38:33 2014 -0700 -- LICENSE | 103 + NOTICE | 572 +- assembly/pom.xml | 2 + 3 files changed, 671 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2e5a7cde/LICENSE -- diff --git a/LICENSE b/LICENSE index 1c1c2c0..383f079 100644 --- a/LICENSE +++ b/LICENSE @@ -428,3 +428,106 @@ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON A THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +For colt: + + +Copyright (c) 1999 CERN - European Organization for Nuclear Research. +Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose is hereby granted without fee, provided that the above copyright notice appear in all copies and that both that copyright notice and this permission notice appear in supporting documentation. CERN makes no representations about the suitability of this software for any purpose. It is provided "as is" without expressed or implied warranty. + +Packages hep.aida.* + +Written by Pavel Binko, Dino Ferrero Merlino, Wolfgang Hoschek, Tony Johnson, Andreas Pfeiffer, and others.
Check the FreeHEP home page for more info. Permission to use and/or redistribute this work is granted under the terms of the LGPL License, with the exception that any usage related to military applications is expressly forbidden. The software and documentation made available under the terms of this license are provided with no warranty. + + + +Fo SnapTree: + + +SNAPTREE LICENSE + +Copyright (c) 2009-2012 Stanford University, unless otherwise specified. +All rights reserved. + +This software was developed by the Pervasive Parallelism Laboratory of +Stanford University, California, USA. + +Permission to use, copy, modify, and distribute this software in source +or binary form for any purpose with or without fee is hereby granted, +provided that the following conditions are met: + + 1. Redistributions of source code must retain the
git commit: Fix dep exclusion: avro-ipc, not avro, depends on netty.
Repository: spark Updated Branches: refs/heads/branch-1.0 7da80a318 -> b3d987893 Fix dep exclusion: avro-ipc, not avro, depends on netty. Author: Marcelo Vanzin Closes #763 from vanzin/netty-dep-hell and squashes the following commits: dfb6ce2 [Marcelo Vanzin] Fix dep exclusion: avro-ipc, not avro, depends on netty. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b3d98789 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b3d98789 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b3d98789 Branch: refs/heads/branch-1.0 Commit: b3d987893a87a94c13865b9710c363ab163a2a08 Parents: 7da80a3 Author: Marcelo Vanzin Authored: Wed May 14 00:37:57 2014 -0700 Committer: Patrick Wendell Committed: Wed May 14 00:40:08 2014 -0700 -- pom.xml | 10 -- 1 file changed, 4 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b3d98789/pom.xml -- diff --git a/pom.xml b/pom.xml index 30b7181..d107e05 100644 --- a/pom.xml +++ b/pom.xml @@ -495,12 +495,6 @@ org.apache.avro avro ${avro.version} - - -io.netty -netty - - org.apache.avro @@ -508,6 +502,10 @@ ${avro.version} +io.netty +netty + + org.mortbay.jetty jetty
git commit: Implement ApproximateCountDistinct for SparkSql
Repository: spark Updated Branches: refs/heads/master 92cebada0 -> c33b8dcbf Implement ApproximateCountDistinct for SparkSql Add the implementation for ApproximateCountDistinct to SparkSql. We use the HyperLogLog algorithm implemented in stream-lib, and do the count in two phases: 1) counting the number of distinct elements in each partition, and 2) merging the HyperLogLog results from the different partitions. A simple serializer and test cases are added as well. Author: larvaboy Closes #737 from larvaboy/master and squashes the following commits: bd8ef3f [larvaboy] Add support of user-provided standard deviation to ApproxCountDistinct. 9ba8360 [larvaboy] Fix alignment and null handling issues. 95b4067 [larvaboy] Add a test case for count distinct and approximate count distinct. f57917d [larvaboy] Add the parser for the approximate count. a2d5d10 [larvaboy] Add ApproximateCountDistinct aggregates and functions. 7ad273a [larvaboy] Add SparkSql serializer for HyperLogLog. 1d9aacf [larvaboy] Fix a minor typo in the toString method of the Count case class. 653542b [larvaboy] Fix a couple of minor typos. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c33b8dcb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c33b8dcb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c33b8dcb Branch: refs/heads/master Commit: c33b8dcbf65a3a0c5ee5e65cd1dcdbc7da36aa5f Parents: 92cebad Author: larvaboy Authored: Tue May 13 21:26:08 2014 -0700 Committer: Reynold Xin Committed: Tue May 13 21:26:08 2014 -0700 -- .../org/apache/spark/rdd/PairRDDFunctions.scala | 6 +- .../apache/spark/sql/catalyst/SqlParser.scala | 7 ++ .../sql/catalyst/expressions/aggregates.scala | 78 +++- .../sql/execution/SparkSqlSerializer.scala | 17 + .../org/apache/spark/sql/SQLQuerySuite.scala| 21 +- 5 files changed, 122 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c33b8dcb/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 5efb438..bc6d204 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -217,7 +217,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * Return approximate number of distinct values for each key in this RDD. * The accuracy of approximation can be controlled through the relative standard deviation * (relativeSD) parameter, which also controls the amount of memory used. Lower values result in - * more accurate counts but increase the memory footprint and vise versa. Uses the provided + * more accurate counts but increase the memory footprint and vice versa. Uses the provided * Partitioner to partition the output RDD. */ def countApproxDistinctByKey(relativeSD: Double, partitioner: Partitioner): RDD[(K, Long)] = { @@ -232,7 +232,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * Return approximate number of distinct values for each key in this RDD. * The accuracy of approximation can be controlled through the relative standard deviation * (relativeSD) parameter, which also controls the amount of memory used. Lower values result in - * more accurate counts but increase the memory footprint and vise versa. HashPartitions the + * more accurate counts but increase the memory footprint and vice versa. HashPartitions the * output RDD into numPartitions. 
* */ @@ -244,7 +244,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * Return approximate number of distinct values for each key this RDD. * The accuracy of approximation can be controlled through the relative standard deviation * (relativeSD) parameter, which also controls the amount of memory used. Lower values result in - * more accurate counts but increase the memory footprint and vise versa. The default value of + * more accurate counts but increase the memory footprint and vice versa. The default value of * relativeSD is 0.05. Hash-partitions the output RDD using the existing partitioner/parallelism * level. */ http://git-wip-us.apache.org/repos/asf/spark/blob/c33b8dcb/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index b3a3a1e..f2b9b2c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spar
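The RDD-level API documented in the PairRDDFunctions diff above can be exercised directly. A minimal sketch, assuming Spark 1.0-era imports and made-up sample data (the object name and pairs are illustrative only, not part of this patch):

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._ // PairRDDFunctions implicits in Spark 1.0

object ApproxDistinctSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ApproxDistinctSketch"))

    // Illustrative (key, value) data; in practice this would come from a real source.
    val pairs = sc.parallelize(Seq(("a", 1), ("a", 2), ("a", 2), ("b", 7), ("b", 8)))

    // Phase 1 builds a HyperLogLog sketch per partition; phase 2 merges the sketches,
    // mirroring the two-phase structure the new SQL aggregate uses. A lower relativeSD
    // gives more accurate counts at the cost of more memory.
    val approx = pairs.countApproxDistinctByKey(relativeSD = 0.05, numPartitions = 4)
    approx.collect().foreach { case (k, n) => println(s"$k -> ~$n distinct values") }

    sc.stop()
  }
}
```

On the SQL side, the SqlParser change is what lets queries express an approximate count-distinct aggregate; the exact query syntax is best taken from the SQLQuerySuite tests added in this commit.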
git commit: Fixed streaming examples docs to use run-example instead of spark-submit
Repository: spark Updated Branches: refs/heads/branch-1.0 69ec3149f -> c7571d8c6 Fixed streaming examples docs to use run-example instead of spark-submit Pretty self-explanatory Author: Tathagata Das Closes #722 from tdas/example-fix and squashes the following commits: 7839979 [Tathagata Das] Minor changes. 0673441 [Tathagata Das] Fixed java docs of java streaming example e687123 [Tathagata Das] Fixed scala style errors. 9b8d112 [Tathagata Das] Fixed streaming examples docs to use run-example instead of spark-submit. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c7571d8c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c7571d8c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c7571d8c Branch: refs/heads/branch-1.0 Commit: c7571d8c6ba058b67cca2b910fd0efacc06642cd Parents: 69ec314 Author: Tathagata Das Authored: Wed May 14 04:17:32 2014 -0700 Committer: Tathagata Das Committed: Wed May 14 04:24:48 2014 -0700 -- .../examples/streaming/JavaCustomReceiver.java | 13 ++--- .../examples/streaming/JavaFlumeEventCount.java | 6 +- .../examples/streaming/JavaKafkaWordCount.java | 6 +- .../streaming/JavaNetworkWordCount.java | 13 +++-- .../examples/streaming/ActorWordCount.scala | 6 +- .../examples/streaming/CustomReceiver.scala | 19 +++ .../examples/streaming/FlumeEventCount.scala| 9 ++- .../examples/streaming/HdfsWordCount.scala | 5 +- .../examples/streaming/KafkaWordCount.scala | 6 +- .../examples/streaming/MQTTWordCount.scala | 10 ++-- .../examples/streaming/NetworkWordCount.scala | 14 +++-- .../streaming/RecoverableNetworkWordCount.scala | 7 +-- .../streaming/StatefulNetworkWordCount.scala| 6 +- .../examples/streaming/TwitterPopularTags.scala | 22 +++- .../examples/streaming/ZeroMQWordCount.scala| 8 +-- .../clickstream/PageViewGenerator.scala | 10 ++-- .../streaming/clickstream/PageViewStream.scala | 7 ++- .../streaming/twitter/TwitterInputDStream.scala | 58 18 files changed, 130 insertions(+), 95 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c7571d8c/examples/src/main/java/org/apache/spark/examples/streaming/JavaCustomReceiver.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/streaming/JavaCustomReceiver.java b/examples/src/main/java/org/apache/spark/examples/streaming/JavaCustomReceiver.java index 7f558f3..5622df5 100644 --- a/examples/src/main/java/org/apache/spark/examples/streaming/JavaCustomReceiver.java +++ b/examples/src/main/java/org/apache/spark/examples/streaming/JavaCustomReceiver.java @@ -19,6 +19,7 @@ package org.apache.spark.examples.streaming; import com.google.common.collect.Lists; +import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.Function2; import org.apache.spark.api.java.function.PairFunction; @@ -48,25 +49,23 @@ import java.util.regex.Pattern; * To run this on your local machine, you need to first run a Netcat server *`$ nc -lk ` * and then run the example - *`$ ./run org.apache.spark.examples.streaming.JavaCustomReceiver local[2] localhost ` + *`$ bin/run-example org.apache.spark.examples.streaming.JavaCustomReceiver localhost ` */ public class JavaCustomReceiver extends Receiver { private static final Pattern SPACE = Pattern.compile(" "); public static void main(String[] args) { -if (args.length < 3) { - System.err.println("Usage: JavaNetworkWordCount \n" + - "In local mode, should be 'local[n]' with n > 1"); +if (args.length < 2) { + System.err.println("Usage: 
JavaNetworkWordCount "); System.exit(1); } StreamingExamples.setStreamingLogLevels(); // Create the context with a 1 second batch size -JavaStreamingContext ssc = new JavaStreamingContext(args[0], "JavaNetworkWordCount", -new Duration(1000), System.getenv("SPARK_HOME"), -JavaStreamingContext.jarOfClass(JavaNetworkWordCount.class)); +SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver"); +JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000)); // Create a input stream with the custom receiver on target ip:port and count the // words in input stream of \n delimited text (eg. generated by 'nc') http://git-wip-us.apache.org/repos/asf/spark/blob/c7571d8c/examples/src/main/java/org/apache/spark/examples/streaming/JavaFlumeEventCount.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/streaming/JavaFlumeEvent
git commit: [SPARK-1769] Executor loss causes NPE race condition
Repository: spark Updated Branches: refs/heads/branch-1.0 b3d987893 -> 69ec3149f [SPARK-1769] Executor loss causes NPE race condition This PR replaces the Schedulable data structures in Pool.scala with thread-safe ones from java. Note that Scala's `with SynchronizedBuffer` trait is soon to be deprecated in 2.11 because it is ["inherently unreliable"](http://www.scala-lang.org/api/2.11.0/index.html#scala.collection.mutable.SynchronizedBuffer). We should slowly drift away from `SynchronizedBuffer` in other places too. Note that this PR introduces an API-breaking change; `sc.getAllPools` now returns an Array rather than an ArrayBuffer. This is because we want this method to return an immutable copy rather than a mutable one that may confuse the user if they try to modify the copy, since such modifications take no effect on the original data structure. Author: Andrew Or Closes #762 from andrewor14/pool-npe and squashes the following commits: 383e739 [Andrew Or] JavaConverters -> JavaConversions 3f32981 [Andrew Or] Merge branch 'master' of github.com:apache/spark into pool-npe 769be19 [Andrew Or] Assorted minor changes 2189247 [Andrew Or] Merge branch 'master' of github.com:apache/spark into pool-npe 05ad9e9 [Andrew Or] Fix test - contains is not the same as containsKey 0921ea0 [Andrew Or] var -> val 07d720c [Andrew Or] Synchronize Schedulable data structures (cherry picked from commit 69f750228f3ec8537a93da08e712596fa8004143) Signed-off-by: Aaron Davidson Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69ec3149 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69ec3149 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69ec3149 Branch: refs/heads/branch-1.0 Commit: 69ec3149fb4d732935748b9afee4f9d8a7b1244e Parents: b3d9878 Author: Andrew Or Authored: Wed May 14 00:54:33 2014 -0700 Committer: Aaron Davidson Committed: Wed May 14 00:54:49 2014 -0700 -- .../scala/org/apache/spark/SparkContext.scala | 20 - .../scala/org/apache/spark/scheduler/Pool.scala | 31 ++-- .../apache/spark/scheduler/Schedulable.scala| 6 ++-- .../spark/scheduler/TaskSchedulerImpl.scala | 2 +- .../scheduler/TaskSchedulerImplSuite.scala | 2 +- 5 files changed, 35 insertions(+), 26 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/69ec3149/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index c43b4fd..032b3d7 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -17,15 +17,17 @@ package org.apache.spark +import scala.language.implicitConversions + import java.io._ import java.net.URI import java.util.concurrent.atomic.AtomicInteger import java.util.{Properties, UUID} import java.util.UUID.randomUUID import scala.collection.{Map, Set} +import scala.collection.JavaConversions._ import scala.collection.generic.Growable -import scala.collection.mutable.{ArrayBuffer, HashMap} -import scala.language.implicitConversions +import scala.collection.mutable.HashMap import scala.reflect.{ClassTag, classTag} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path @@ -836,18 +838,22 @@ class SparkContext(config: SparkConf) extends Logging { } /** - * Return pools for fair scheduler - * TODO(xiajunluan): We should take nested pools into account + * :: DeveloperApi :: + * Return pools for fair scheduler */ - def getAllPools: 
ArrayBuffer[Schedulable] = { -taskScheduler.rootPool.schedulableQueue + @DeveloperApi + def getAllPools: Seq[Schedulable] = { +// TODO(xiajunluan): We should take nested pools into account +taskScheduler.rootPool.schedulableQueue.toSeq } /** + * :: DeveloperApi :: * Return the pool associated with the given name, if one exists */ + @DeveloperApi def getPoolForName(pool: String): Option[Schedulable] = { -taskScheduler.rootPool.schedulableNameToSchedulable.get(pool) +Option(taskScheduler.rootPool.schedulableNameToSchedulable.get(pool)) } /** http://git-wip-us.apache.org/repos/asf/spark/blob/69ec3149/core/src/main/scala/org/apache/spark/scheduler/Pool.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/Pool.scala b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala index 187672c..174b732 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Pool.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala @@ -17,8 +17,10 @@ package org.apache.spark.schedu
git commit: [SPARK-1769] Executor loss causes NPE race condition
Repository: spark Updated Branches: refs/heads/master 54ae8328b -> 69f750228 [SPARK-1769] Executor loss causes NPE race condition This PR replaces the Schedulable data structures in Pool.scala with thread-safe ones from java. Note that Scala's `with SynchronizedBuffer` trait is soon to be deprecated in 2.11 because it is ["inherently unreliable"](http://www.scala-lang.org/api/2.11.0/index.html#scala.collection.mutable.SynchronizedBuffer). We should slowly drift away from `SynchronizedBuffer` in other places too. Note that this PR introduces an API-breaking change; `sc.getAllPools` now returns an Array rather than an ArrayBuffer. This is because we want this method to return an immutable copy rather than a mutable one that may confuse the user if they try to modify the copy, since such modifications take no effect on the original data structure. Author: Andrew Or Closes #762 from andrewor14/pool-npe and squashes the following commits: 383e739 [Andrew Or] JavaConverters -> JavaConversions 3f32981 [Andrew Or] Merge branch 'master' of github.com:apache/spark into pool-npe 769be19 [Andrew Or] Assorted minor changes 2189247 [Andrew Or] Merge branch 'master' of github.com:apache/spark into pool-npe 05ad9e9 [Andrew Or] Fix test - contains is not the same as containsKey 0921ea0 [Andrew Or] var -> val 07d720c [Andrew Or] Synchronize Schedulable data structures Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69f75022 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69f75022 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69f75022 Branch: refs/heads/master Commit: 69f750228f3ec8537a93da08e712596fa8004143 Parents: 54ae832 Author: Andrew Or Authored: Wed May 14 00:54:33 2014 -0700 Committer: Aaron Davidson Committed: Wed May 14 00:54:33 2014 -0700 -- .../scala/org/apache/spark/SparkContext.scala | 20 - .../scala/org/apache/spark/scheduler/Pool.scala | 31 ++-- .../apache/spark/scheduler/Schedulable.scala| 6 ++-- .../spark/scheduler/TaskSchedulerImpl.scala | 2 +- .../scheduler/TaskSchedulerImplSuite.scala | 2 +- 5 files changed, 35 insertions(+), 26 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/69f75022/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index c43b4fd..032b3d7 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -17,15 +17,17 @@ package org.apache.spark +import scala.language.implicitConversions + import java.io._ import java.net.URI import java.util.concurrent.atomic.AtomicInteger import java.util.{Properties, UUID} import java.util.UUID.randomUUID import scala.collection.{Map, Set} +import scala.collection.JavaConversions._ import scala.collection.generic.Growable -import scala.collection.mutable.{ArrayBuffer, HashMap} -import scala.language.implicitConversions +import scala.collection.mutable.HashMap import scala.reflect.{ClassTag, classTag} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path @@ -836,18 +838,22 @@ class SparkContext(config: SparkConf) extends Logging { } /** - * Return pools for fair scheduler - * TODO(xiajunluan): We should take nested pools into account + * :: DeveloperApi :: + * Return pools for fair scheduler */ - def getAllPools: ArrayBuffer[Schedulable] = { -taskScheduler.rootPool.schedulableQueue + @DeveloperApi + def getAllPools: 
Seq[Schedulable] = { +// TODO(xiajunluan): We should take nested pools into account +taskScheduler.rootPool.schedulableQueue.toSeq } /** + * :: DeveloperApi :: * Return the pool associated with the given name, if one exists */ + @DeveloperApi def getPoolForName(pool: String): Option[Schedulable] = { -taskScheduler.rootPool.schedulableNameToSchedulable.get(pool) +Option(taskScheduler.rootPool.schedulableNameToSchedulable.get(pool)) } /** http://git-wip-us.apache.org/repos/asf/spark/blob/69f75022/core/src/main/scala/org/apache/spark/scheduler/Pool.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/Pool.scala b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala index 187672c..174b732 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Pool.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala @@ -17,8 +17,10 @@ package org.apache.spark.scheduler +import java.util.concurrent.{ConcurrentHashMap, ConcurrentLinkedQueue} + +import scala.collection.Java
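The fix boils down to swapping Scala's synchronized collections for java.util.concurrent ones and handing callers immutable snapshots. A standalone sketch of that pattern, using the same imports as the Pool.scala change (the Item type and pool names below are illustrative, not Spark classes):

```scala
import java.util.concurrent.{ConcurrentHashMap, ConcurrentLinkedQueue}

import scala.collection.JavaConversions._

// Illustrative stand-in for a Schedulable, just to keep the sketch self-contained.
case class Item(name: String, weight: Int)

object ThreadSafePoolSketch {
  // Java's concurrent collections replace ArrayBuffer/HashMap so that a concurrent
  // executor-loss event and a scheduling pass never see a half-updated pool.
  val queue = new ConcurrentLinkedQueue[Item]()
  val byName = new ConcurrentHashMap[String, Item]()

  def add(item: Item): Unit = {
    queue.add(item)
    byName.put(item.name, item)
  }

  // Callers get an immutable snapshot (a Seq), mirroring the getAllPools change:
  // mutating the returned copy has no effect on the underlying queue.
  def snapshot: Seq[Item] = queue.toSeq

  // ConcurrentHashMap.get returns null for missing keys, hence the Option(...) wrapper,
  // the same null-safety fix applied to getPoolForName.
  def lookup(name: String): Option[Item] = Option(byName.get(name))

  def main(args: Array[String]): Unit = {
    add(Item("production", 2))
    add(Item("adhoc", 1))
    println(snapshot)
    println(lookup("production"))
  }
}
```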
git commit: Fix dep exclusion: avro-ipc, not avro, depends on netty.
Repository: spark Updated Branches: refs/heads/master b22952fa1 -> 54ae8328b Fix dep exclusion: avro-ipc, not avro, depends on netty. Author: Marcelo Vanzin Closes #763 from vanzin/netty-dep-hell and squashes the following commits: dfb6ce2 [Marcelo Vanzin] Fix dep exclusion: avro-ipc, not avro, depends on netty. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/54ae8328 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/54ae8328 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/54ae8328 Branch: refs/heads/master Commit: 54ae8328bd7d052ba347768cfb02cb5dfdd8045e Parents: b22952f Author: Marcelo Vanzin Authored: Wed May 14 00:37:57 2014 -0700 Committer: Patrick Wendell Committed: Wed May 14 00:37:57 2014 -0700 -- pom.xml | 10 -- 1 file changed, 4 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/54ae8328/pom.xml -- diff --git a/pom.xml b/pom.xml index 4d4c5f6..786b6d4 100644 --- a/pom.xml +++ b/pom.xml @@ -496,12 +496,6 @@ org.apache.avro avro ${avro.version} - - -io.netty -netty - - org.apache.avro @@ -509,6 +503,10 @@ ${avro.version} +io.netty +netty + + org.mortbay.jetty jetty
git commit: SPARK-1801. expose InterruptibleIterator and TaskKilledException in deve...
Repository: spark Updated Branches: refs/heads/branch-1.0 f66f76648 -> 7da80a318 SPARK-1801. expose InterruptibleIterator and TaskKilledException in deve... ...loper api Author: Koert Kuipers Closes #764 from koertkuipers/feat-rdd-developerapi and squashes the following commits: 8516dd2 [Koert Kuipers] SPARK-1801. expose InterruptibleIterator and TaskKilledException in developer api (cherry picked from commit b22952fa1f21c0b93208846b5e1941a9d2578c6f) Signed-off-by: Aaron Davidson Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7da80a31 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7da80a31 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7da80a31 Branch: refs/heads/branch-1.0 Commit: 7da80a3186e9120c26ed88dc1211356a1d5eb8af Parents: f66f766 Author: Koert Kuipers Authored: Wed May 14 00:10:12 2014 -0700 Committer: Aaron Davidson Committed: Wed May 14 00:12:59 2014 -0700 -- .../main/scala/org/apache/spark/InterruptibleIterator.scala | 6 +- .../main/scala/org/apache/spark/TaskKilledException.scala| 8 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7da80a31/core/src/main/scala/org/apache/spark/InterruptibleIterator.scala -- diff --git a/core/src/main/scala/org/apache/spark/InterruptibleIterator.scala b/core/src/main/scala/org/apache/spark/InterruptibleIterator.scala index ec11dbb..f40baa8 100644 --- a/core/src/main/scala/org/apache/spark/InterruptibleIterator.scala +++ b/core/src/main/scala/org/apache/spark/InterruptibleIterator.scala @@ -17,11 +17,15 @@ package org.apache.spark +import org.apache.spark.annotation.DeveloperApi + /** + * :: DeveloperApi :: * An iterator that wraps around an existing iterator to provide task killing functionality. * It works by checking the interrupted flag in [[TaskContext]]. */ -private[spark] class InterruptibleIterator[+T](val context: TaskContext, val delegate: Iterator[T]) +@DeveloperApi +class InterruptibleIterator[+T](val context: TaskContext, val delegate: Iterator[T]) extends Iterator[T] { def hasNext: Boolean = { http://git-wip-us.apache.org/repos/asf/spark/blob/7da80a31/core/src/main/scala/org/apache/spark/TaskKilledException.scala -- diff --git a/core/src/main/scala/org/apache/spark/TaskKilledException.scala b/core/src/main/scala/org/apache/spark/TaskKilledException.scala index cbd6b28..ad487c4 100644 --- a/core/src/main/scala/org/apache/spark/TaskKilledException.scala +++ b/core/src/main/scala/org/apache/spark/TaskKilledException.scala @@ -17,7 +17,11 @@ package org.apache.spark +import org.apache.spark.annotation.DeveloperApi + /** - * Exception for a task getting killed. + * :: DeveloperApi :: + * Exception thrown when a task is explicitly killed (i.e., task failure is expected). */ -private[spark] class TaskKilledException extends RuntimeException +@DeveloperApi +class TaskKilledException extends RuntimeException
git commit: SPARK-1801. expose InterruptibleIterator and TaskKilledException in deve...
Repository: spark Updated Branches: refs/heads/master 6ce088444 -> b22952fa1 SPARK-1801. expose InterruptibleIterator and TaskKilledException in deve... ...loper api Author: Koert Kuipers Closes #764 from koertkuipers/feat-rdd-developerapi and squashes the following commits: 8516dd2 [Koert Kuipers] SPARK-1801. expose InterruptibleIterator and TaskKilledException in developer api Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b22952fa Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b22952fa Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b22952fa Branch: refs/heads/master Commit: b22952fa1f21c0b93208846b5e1941a9d2578c6f Parents: 6ce0884 Author: Koert Kuipers Authored: Wed May 14 00:10:12 2014 -0700 Committer: Aaron Davidson Committed: Wed May 14 00:12:35 2014 -0700 -- .../main/scala/org/apache/spark/InterruptibleIterator.scala | 6 +- .../main/scala/org/apache/spark/TaskKilledException.scala| 8 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b22952fa/core/src/main/scala/org/apache/spark/InterruptibleIterator.scala -- diff --git a/core/src/main/scala/org/apache/spark/InterruptibleIterator.scala b/core/src/main/scala/org/apache/spark/InterruptibleIterator.scala index ec11dbb..f40baa8 100644 --- a/core/src/main/scala/org/apache/spark/InterruptibleIterator.scala +++ b/core/src/main/scala/org/apache/spark/InterruptibleIterator.scala @@ -17,11 +17,15 @@ package org.apache.spark +import org.apache.spark.annotation.DeveloperApi + /** + * :: DeveloperApi :: * An iterator that wraps around an existing iterator to provide task killing functionality. * It works by checking the interrupted flag in [[TaskContext]]. */ -private[spark] class InterruptibleIterator[+T](val context: TaskContext, val delegate: Iterator[T]) +@DeveloperApi +class InterruptibleIterator[+T](val context: TaskContext, val delegate: Iterator[T]) extends Iterator[T] { def hasNext: Boolean = { http://git-wip-us.apache.org/repos/asf/spark/blob/b22952fa/core/src/main/scala/org/apache/spark/TaskKilledException.scala -- diff --git a/core/src/main/scala/org/apache/spark/TaskKilledException.scala b/core/src/main/scala/org/apache/spark/TaskKilledException.scala index cbd6b28..ad487c4 100644 --- a/core/src/main/scala/org/apache/spark/TaskKilledException.scala +++ b/core/src/main/scala/org/apache/spark/TaskKilledException.scala @@ -17,7 +17,11 @@ package org.apache.spark +import org.apache.spark.annotation.DeveloperApi + /** - * Exception for a task getting killed. + * :: DeveloperApi :: + * Exception thrown when a task is explicitly killed (i.e., task failure is expected). */ -private[spark] class TaskKilledException extends RuntimeException +@DeveloperApi +class TaskKilledException extends RuntimeException
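With these classes public, third-party RDDs can opt in to the same cooperative task-kill behavior as Spark's built-in RDDs: a killed task surfaces as a TaskKilledException instead of silently draining the partition. A rough sketch of how a custom RDD might wrap its iterator (the LinesRDD class is illustrative, not part of this patch):

```scala
import org.apache.spark.{InterruptibleIterator, Partition, TaskContext}
import org.apache.spark.rdd.RDD

// Illustrative custom RDD: wrapping the raw iterator in InterruptibleIterator lets a
// task that has been killed stop early rather than processing the whole partition.
class LinesRDD(prev: RDD[String]) extends RDD[String](prev) {
  override def getPartitions: Array[Partition] = prev.partitions

  override def compute(split: Partition, context: TaskContext): Iterator[String] = {
    val raw = prev.iterator(split, context).map(_.trim)
    // Checks the interrupted flag in TaskContext before each element is produced.
    new InterruptibleIterator(context, raw)
  }
}
```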