Merge remote-tracking branch 'upstream/master' into sparsesvd
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/845e568f Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/845e568f Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/845e568f Branch: refs/heads/master Commit: 845e568fada0550e632e7381748c5a9ebbe53e16 Parents: f324d53 fdaabdc Author: Reza Zadeh <riz...@gmail.com> Authored: Mon Jan 13 23:52:34 2014 -0800 Committer: Reza Zadeh <riz...@gmail.com> Committed: Mon Jan 13 23:52:34 2014 -0800 ---------------------------------------------------------------------- .../org/apache/spark/bagel/BagelSuite.scala | 1 - bin/compute-classpath.sh | 2 + .../scala/org/apache/spark/Accumulators.scala | 6 +- .../scala/org/apache/spark/Aggregator.scala | 14 +- .../scala/org/apache/spark/CacheManager.scala | 4 +- .../scala/org/apache/spark/FutureAction.scala | 8 +- .../scala/org/apache/spark/HttpFileServer.scala | 6 +- .../apache/spark/InterruptibleIterator.scala | 2 +- .../main/scala/org/apache/spark/Logging.scala | 4 +- .../org/apache/spark/MapOutputTracker.scala | 12 +- .../scala/org/apache/spark/Partitioner.scala | 4 +- .../scala/org/apache/spark/SparkContext.scala | 75 +- .../main/scala/org/apache/spark/SparkEnv.scala | 10 - .../org/apache/spark/SparkHadoopWriter.scala | 15 +- .../org/apache/spark/api/python/PythonRDD.scala | 3 +- .../org/apache/spark/broadcast/Broadcast.scala | 1 + .../spark/broadcast/BroadcastFactory.scala | 2 +- .../apache/spark/broadcast/HttpBroadcast.scala | 5 +- .../spark/broadcast/TorrentBroadcast.scala | 12 +- .../scala/org/apache/spark/deploy/Client.scala | 3 +- .../spark/deploy/worker/CommandUtils.scala | 3 +- .../executor/CoarseGrainedExecutorBackend.scala | 1 - .../org/apache/spark/executor/Executor.scala | 4 +- .../org/apache/spark/executor/TaskMetrics.scala | 10 + .../apache/spark/network/BufferMessage.scala | 2 +- .../org/apache/spark/network/Connection.scala | 6 +- 
.../org/apache/spark/network/Message.scala | 6 +- .../spark/network/netty/ShuffleSender.scala | 2 +- .../main/scala/org/apache/spark/package.scala | 3 + .../org/apache/spark/rdd/CoGroupedRDD.scala | 3 + .../org/apache/spark/rdd/CoalescedRDD.scala | 10 +- .../scala/org/apache/spark/rdd/HadoopRDD.scala | 32 +- .../org/apache/spark/rdd/NewHadoopRDD.scala | 24 +- .../org/apache/spark/rdd/PairRDDFunctions.scala | 12 +- .../scala/org/apache/spark/rdd/PipedRDD.scala | 5 +- .../main/scala/org/apache/spark/rdd/RDD.scala | 9 +- .../apache/spark/scheduler/DAGScheduler.scala | 4 +- .../spark/scheduler/InputFormatInfo.scala | 8 +- .../scala/org/apache/spark/scheduler/Pool.scala | 8 +- .../spark/scheduler/SchedulingAlgorithm.scala | 11 +- .../apache/spark/scheduler/SparkListener.scala | 16 +- .../spark/scheduler/SparkListenerBus.scala | 17 +- .../org/apache/spark/scheduler/Stage.scala | 2 +- .../org/apache/spark/scheduler/TaskResult.scala | 2 +- .../spark/scheduler/TaskResultGetter.scala | 2 +- .../spark/scheduler/TaskSchedulerImpl.scala | 5 +- .../apache/spark/scheduler/TaskSetManager.scala | 14 +- .../cluster/CoarseGrainedSchedulerBackend.scala | 2 +- .../mesos/CoarseMesosSchedulerBackend.scala | 2 +- .../cluster/mesos/MesosSchedulerBackend.scala | 6 +- .../org/apache/spark/storage/BlockManager.scala | 7 +- .../spark/storage/BlockManagerWorker.scala | 20 +- .../org/apache/spark/storage/BlockMessage.scala | 2 +- .../spark/storage/BlockMessageArray.scala | 2 +- .../spark/storage/BlockObjectWriter.scala | 4 +- .../org/apache/spark/storage/MemoryStore.scala | 2 +- .../spark/storage/ShuffleBlockManager.scala | 2 +- .../org/apache/spark/storage/StorageLevel.scala | 6 +- .../apache/spark/ui/jobs/ExecutorSummary.scala | 2 + .../apache/spark/ui/jobs/ExecutorTable.scala | 4 + .../spark/ui/jobs/JobProgressListener.scala | 14 + .../org/apache/spark/ui/jobs/StagePage.scala | 53 +- .../org/apache/spark/ui/jobs/StageTable.scala | 2 +- .../org/apache/spark/util/ClosureCleaner.scala | 10 +- 
.../apache/spark/util/CompletionIterator.scala | 11 +- .../org/apache/spark/util/MetadataCleaner.scala | 8 +- .../spark/util/RateLimitedOutputStream.scala | 79 -- .../org/apache/spark/util/SizeEstimator.scala | 10 +- .../scala/org/apache/spark/util/Utils.scala | 36 +- .../scala/org/apache/spark/util/Vector.scala | 12 +- .../spark/util/collection/AppendOnlyMap.scala | 2 +- .../apache/spark/util/collection/BitSet.scala | 87 +- .../util/collection/ExternalAppendOnlyMap.scala | 72 +- .../spark/util/collection/OpenHashSet.scala | 23 +- .../org/apache/spark/LocalSparkContext.scala | 1 - .../apache/spark/MapOutputTrackerSuite.scala | 1 - .../spark/scheduler/ClusterSchedulerSuite.scala | 9 +- .../spark/scheduler/DAGSchedulerSuite.scala | 3 +- .../apache/spark/scheduler/JobLoggerSuite.scala | 7 +- .../spark/storage/BlockManagerSuite.scala | 4 - .../apache/spark/util/ClosureCleanerSuite.scala | 14 +- .../util/RateLimitedOutputStreamSuite.scala | 40 - .../collection/ExternalAppendOnlyMapSuite.scala | 77 +- docs/_config.yml | 2 +- docs/_layouts/global.html | 8 +- docs/_plugins/copy_api_dirs.rb | 2 +- docs/api.md | 1 + docs/bagel-programming-guide.md | 10 +- docs/configuration.md | 13 +- docs/graphx-programming-guide.md | 1003 ++++++++++++++++++ docs/img/data_parallel_vs_graph_parallel.png | Bin 0 -> 432725 bytes docs/img/edge-cut.png | Bin 0 -> 12563 bytes docs/img/edge_cut_vs_vertex_cut.png | Bin 0 -> 79745 bytes docs/img/graph_analytics_pipeline.png | Bin 0 -> 427220 bytes docs/img/graph_parallel.png | Bin 0 -> 92288 bytes docs/img/graphx_figures.pptx | Bin 0 -> 1123363 bytes docs/img/graphx_logo.png | Bin 0 -> 40324 bytes docs/img/graphx_performance_comparison.png | Bin 0 -> 166343 bytes docs/img/property_graph.png | Bin 0 -> 225151 bytes docs/img/tables_and_graphs.png | Bin 0 -> 166265 bytes docs/img/triplet.png | Bin 0 -> 31489 bytes docs/img/vertex-cut.png | Bin 0 -> 12246 bytes docs/img/vertex_routing_edge_tables.png | Bin 0 -> 570007 bytes docs/index.md | 4 +- 
docs/mllib-guide.md | 19 +- docs/python-programming-guide.md | 8 +- docs/streaming-programming-guide.md | 6 +- .../org/apache/spark/examples/LocalALS.scala | 8 +- .../org/apache/spark/examples/LocalFileLR.scala | 2 +- .../org/apache/spark/examples/LocalKMeans.scala | 2 +- .../org/apache/spark/examples/SparkALS.scala | 6 +- .../org/apache/spark/examples/SparkHdfsLR.scala | 2 +- .../org/apache/spark/examples/SparkKMeans.scala | 12 +- .../examples/graphx/LiveJournalPageRank.scala | 49 + .../streaming/examples/RawNetworkGrep.scala | 2 +- .../examples/RecoverableNetworkWordCount.scala | 2 +- .../streaming/examples/TwitterAlgebirdCMS.scala | 4 +- .../streaming/examples/TwitterAlgebirdHLL.scala | 4 +- .../streaming/examples/TwitterPopularTags.scala | 4 +- .../clickstream/PageViewGenerator.scala | 2 +- .../examples/clickstream/PageViewStream.scala | 2 +- .../spark/streaming/flume/FlumeUtils.scala | 4 +- .../spark/streaming/kafka/KafkaUtils.scala | 6 +- .../apache/spark/streaming/mqtt/MQTTUtils.scala | 4 +- .../spark/streaming/mqtt/MQTTStreamSuite.scala | 2 +- .../spark/streaming/twitter/TwitterUtils.scala | 7 +- .../streaming/twitter/TwitterStreamSuite.scala | 2 +- .../spark/streaming/zeromq/ZeroMQUtils.scala | 3 +- graphx/data/followers.txt | 8 + graphx/data/users.txt | 7 + graphx/pom.xml | 67 ++ .../scala/org/apache/spark/graphx/Edge.scala | 45 + .../org/apache/spark/graphx/EdgeDirection.scala | 44 + .../scala/org/apache/spark/graphx/EdgeRDD.scala | 102 ++ .../org/apache/spark/graphx/EdgeTriplet.scala | 49 + .../scala/org/apache/spark/graphx/Graph.scala | 405 +++++++ .../spark/graphx/GraphKryoRegistrator.scala | 31 + .../org/apache/spark/graphx/GraphLoader.scala | 72 ++ .../org/apache/spark/graphx/GraphOps.scala | 301 ++++++ .../apache/spark/graphx/PartitionStrategy.scala | 103 ++ .../scala/org/apache/spark/graphx/Pregel.scala | 139 +++ .../org/apache/spark/graphx/VertexRDD.scala | 347 ++++++ .../spark/graphx/impl/EdgePartition.scala | 220 ++++ 
.../graphx/impl/EdgePartitionBuilder.scala | 45 + .../spark/graphx/impl/EdgeTripletIterator.scala | 42 + .../apache/spark/graphx/impl/GraphImpl.scala | 379 +++++++ .../spark/graphx/impl/MessageToPartition.scala | 98 ++ .../graphx/impl/ReplicatedVertexView.scala | 195 ++++ .../apache/spark/graphx/impl/RoutingTable.scala | 65 ++ .../apache/spark/graphx/impl/Serializers.scala | 395 +++++++ .../spark/graphx/impl/VertexPartition.scala | 261 +++++ .../org/apache/spark/graphx/impl/package.scala | 7 + .../org/apache/spark/graphx/lib/Analytics.scala | 136 +++ .../spark/graphx/lib/ConnectedComponents.scala | 38 + .../org/apache/spark/graphx/lib/PageRank.scala | 147 +++ .../apache/spark/graphx/lib/SVDPlusPlus.scala | 138 +++ .../lib/StronglyConnectedComponents.scala | 94 ++ .../apache/spark/graphx/lib/TriangleCount.scala | 76 ++ .../scala/org/apache/spark/graphx/package.scala | 18 + .../spark/graphx/util/BytecodeUtils.scala | 117 ++ .../spark/graphx/util/GraphGenerators.scala | 218 ++++ .../collection/PrimitiveKeyOpenHashMap.scala | 153 +++ graphx/src/test/resources/log4j.properties | 28 + .../org/apache/spark/graphx/GraphOpsSuite.scala | 66 ++ .../org/apache/spark/graphx/GraphSuite.scala | 273 +++++ .../apache/spark/graphx/LocalSparkContext.scala | 28 + .../org/apache/spark/graphx/PregelSuite.scala | 41 + .../apache/spark/graphx/SerializerSuite.scala | 183 ++++ .../apache/spark/graphx/VertexRDDSuite.scala | 85 ++ .../spark/graphx/impl/EdgePartitionSuite.scala | 76 ++ .../graphx/impl/VertexPartitionSuite.scala | 113 ++ .../graphx/lib/ConnectedComponentsSuite.scala | 113 ++ .../apache/spark/graphx/lib/PageRankSuite.scala | 119 +++ .../spark/graphx/lib/SVDPlusPlusSuite.scala | 31 + .../lib/StronglyConnectedComponentsSuite.scala | 57 + .../spark/graphx/lib/TriangleCountSuite.scala | 70 ++ .../spark/graphx/util/BytecodeUtilsSuite.scala | 93 ++ mllib/data/sample_naive_bayes_data.txt | 6 + .../spark/mllib/api/python/PythonMLLibAPI.scala | 46 +- 
.../classification/LogisticRegression.scala | 4 +- .../spark/mllib/classification/NaiveBayes.scala | 65 +- .../apache/spark/mllib/classification/SVM.scala | 2 + .../apache/spark/mllib/recommendation/ALS.scala | 15 +- .../spark/mllib/regression/LabeledPoint.scala | 6 +- .../apache/spark/mllib/regression/Lasso.scala | 4 +- .../mllib/regression/LinearRegression.scala | 2 + .../mllib/regression/RidgeRegression.scala | 4 +- .../classification/JavaNaiveBayesSuite.java | 72 ++ pom.xml | 5 +- project/SparkBuild.scala | 21 +- python/pyspark/mllib/_common.py | 2 +- python/pyspark/mllib/classification.py | 77 +- python/pyspark/mllib/clustering.py | 11 +- python/pyspark/mllib/recommendation.py | 10 +- python/pyspark/mllib/regression.py | 35 +- python/pyspark/worker.py | 4 + python/run-tests | 5 + .../scala/org/apache/spark/repl/ReplSuite.scala | 2 - .../org/apache/spark/streaming/Checkpoint.scala | 6 +- .../apache/spark/streaming/ContextWaiter.scala | 28 + .../org/apache/spark/streaming/DStream.scala | 741 ------------- .../spark/streaming/DStreamCheckpointData.scala | 128 --- .../apache/spark/streaming/DStreamGraph.scala | 12 +- .../spark/streaming/PairDStreamFunctions.scala | 621 ----------- .../spark/streaming/StreamingContext.scala | 117 +- .../spark/streaming/api/java/JavaDStream.scala | 3 +- .../streaming/api/java/JavaDStreamLike.scala | 30 +- .../streaming/api/java/JavaPairDStream.scala | 1 + .../api/java/JavaStreamingContext.scala | 28 +- .../spark/streaming/dstream/DStream.scala | 775 ++++++++++++++ .../dstream/DStreamCheckpointData.scala | 126 +++ .../streaming/dstream/FileInputDStream.scala | 86 +- .../streaming/dstream/FilteredDStream.scala | 2 +- .../dstream/FlatMapValuedDStream.scala | 2 +- .../streaming/dstream/FlatMappedDStream.scala | 2 +- .../streaming/dstream/ForEachDStream.scala | 2 +- .../streaming/dstream/GlommedDStream.scala | 2 +- .../spark/streaming/dstream/InputDStream.scala | 4 +- .../dstream/MapPartitionedDStream.scala | 2 +- 
.../streaming/dstream/MapValuedDStream.scala | 2 +- .../spark/streaming/dstream/MappedDStream.scala | 2 +- .../streaming/dstream/NetworkInputDStream.scala | 2 +- .../dstream/PairDStreamFunctions.scala | 622 +++++++++++ .../dstream/ReducedWindowedDStream.scala | 2 +- .../streaming/dstream/ShuffledDStream.scala | 2 +- .../spark/streaming/dstream/StateDStream.scala | 10 +- .../streaming/dstream/TransformedDStream.scala | 2 +- .../spark/streaming/dstream/UnionDStream.scala | 3 +- .../streaming/dstream/WindowedDStream.scala | 17 +- .../apache/spark/streaming/scheduler/Job.scala | 9 +- .../streaming/scheduler/JobGenerator.scala | 81 +- .../streaming/scheduler/JobScheduler.scala | 141 ++- .../spark/streaming/scheduler/JobSet.scala | 18 +- .../scheduler/NetworkInputTracker.scala | 40 +- .../streaming/scheduler/StreamingListener.scala | 3 +- .../scheduler/StreamingListenerBus.scala | 23 +- .../org/apache/spark/streaming/util/Clock.scala | 4 +- .../streaming/util/MasterFailureTest.scala | 4 +- .../util/RateLimitedOutputStream.scala | 79 ++ .../spark/streaming/util/RawTextHelper.scala | 2 +- .../spark/streaming/util/RawTextSender.scala | 13 +- .../spark/streaming/util/RecurringTimer.scala | 13 +- .../streaming/LocalJavaStreamingContext.java | 2 - .../spark/streaming/BasicOperationsSuite.scala | 79 +- .../spark/streaming/CheckpointSuite.scala | 25 +- .../spark/streaming/StreamingContextSuite.scala | 219 ++++ .../streaming/StreamingListenerSuite.scala | 1 + .../apache/spark/streaming/TestSuiteBase.scala | 10 +- .../spark/streaming/WindowOperationsSuite.scala | 15 + .../util/RateLimitedOutputStreamSuite.scala | 40 + .../tools/JavaAPICompletenessChecker.scala | 50 +- .../org/apache/spark/deploy/yarn/Client.scala | 7 +- .../spark/deploy/yarn/WorkerLauncher.scala | 8 +- .../spark/deploy/yarn/WorkerRunnable.scala | 7 +- .../yarn/ClientDistributedCacheManager.scala | 10 +- .../ClientDistributedCacheManagerSuite.scala | 2 +- .../org/apache/spark/deploy/yarn/Client.scala | 3 +- 257 
files changed, 10443 insertions(+), 2434 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/845e568f/docs/mllib-guide.md ---------------------------------------------------------------------- diff --cc docs/mllib-guide.md index 21d0464,1a5c640..a140ecb --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@@ -426,55 -435,5 +435,55 @@@ signals), you can use the trainImplici {% highlight python %} # Build the recommendation model using Alternating Least Squares based on implicit ratings - model = ALS.trainImplicit(sc, ratings, 1, 20) + model = ALS.trainImplicit(ratings, 1, 20) {% endhighlight %} + + +# Singular Value Decomposition +Singular Value Decomposition for Tall and Skinny matrices. +Given an *m x n* matrix *A*, we can compute matrices *U, S, V* such that + +*A = U * S * V^T* + +There is no restriction on m, but we require n^2 doubles to fit in memory. +Further, n should be less than m. + +The decomposition is computed by first computing *A^TA = V S^2 V^T*, +computing svd locally on that (since n x n is small), +from which we recover S and V. +Then we compute U via easy matrix multiplication +as *U = A * V * S^-1* + +Only singular vectors associated with largest k singular values +are recovered. If there are k +such values, then the dimensions of the return will be: + +* *S* is *k x k* and diagonal, holding the singular values on diagonal. +* *U* is *m x k* and satisfies U^T*U = eye(k). +* *V* is *n x k* and satisfies V^TV = eye(k). + +All input and output is expected in sparse matrix format, 1-indexed +as tuples of the form ((i,j),value) all in +SparseMatrix RDDs. Below is example usage. 
+ +{% highlight scala %} + +import org.apache.spark.SparkContext +import org.apache.spark.mllib.linalg.SVD +import org.apache.spark.mllib.linalg.SparseMatrix +import org.apache.spark.mllib.linalg.MatrixEntry + +// Load and parse the data file +val data = sc.textFile("mllib/data/als/test.data").map { line => + val parts = line.split(',') + MatrixEntry(parts(0).toInt, parts(1).toInt, parts(2).toDouble) +} +val m = 4 +val n = 4 +val k = 1 + +// recover largest singular vector +val decomposed = SVD.sparseSVD(SparseMatrix(data, m, n), k) +val s = decomposed.S.data + +println("singular values = " + s.toArray.mkString)