git commit: SPARK-2293. Replace RDD.zip usage by map with predict inside.

meng Mon, 30 Jun 2014 16:04:29 -0700

Repository: spark
Updated Branches:
  refs/heads/master 5fccb567b -> 04fa1223e



SPARK-2293. Replace RDD.zip usage by map with predict inside.

This is the only occurrence of this pattern in the examples that needs to be 
replaced. It only addresses the example change.

Author: Sean Owen <[email protected]>

Closes #1250 from srowen/SPARK-2293 and squashes the following commits:

6b1b28c [Sean Owen] Compute prediction-and-label RDD directly rather than by 
zipping, for efficiency


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/04fa1223
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/04fa1223
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/04fa1223

Branch: refs/heads/master
Commit: 04fa1223ee69760f0d23b40e56f4b036aa301879
Parents: 5fccb56
Author: Sean Owen <[email protected]>
Authored: Mon Jun 30 16:03:38 2014 -0700
Committer: Xiangrui Meng <[email protected]>
Committed: Mon Jun 30 16:03:38 2014 -0700

----------------------------------------------------------------------
 docs/mllib-naive-bayes.md | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/04fa1223/docs/mllib-naive-bayes.md
----------------------------------------------------------------------
diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md
index 4b3a7ca..1d1d7dc 100644
--- a/docs/mllib-naive-bayes.md
+++ b/docs/mllib-naive-bayes.md
@@ -51,9 +51,8 @@ val training = splits(0)
 val test = splits(1)
 
 val model = NaiveBayes.train(training, lambda = 1.0)
-val prediction = model.predict(test.map(_.features))
 
-val predictionAndLabel = prediction.zip(test.map(_.label))
+val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
 val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
 {% endhighlight %}
 </div>
@@ -71,6 +70,7 @@ can be used for evaluation and prediction.
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.function.Function;
+import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.mllib.classification.NaiveBayes;
 import org.apache.spark.mllib.classification.NaiveBayesModel;
 import org.apache.spark.mllib.regression.LabeledPoint;
@@ -81,18 +81,12 @@ JavaRDD<LabeledPoint> test = ... // test set
 
 final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
 
-JavaRDD<Double> prediction =
-  test.map(new Function<LabeledPoint, Double>() {
-    @Override public Double call(LabeledPoint p) {
-      return model.predict(p.features());
-    }
-  });
 JavaPairRDD<Double, Double> predictionAndLabel = 
-  prediction.zip(test.map(new Function<LabeledPoint, Double>() {
-    @Override public Double call(LabeledPoint p) {
-      return p.label();
+  test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
+    @Override public Tuple2<Double, Double> call(LabeledPoint p) {
+      return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
     }
-  }));
+  });
 double accuracy = 1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
     @Override public Boolean call(Tuple2<Double, Double> pl) {
       return pl._1() == pl._2();

git commit: SPARK-2293. Replace RDD.zip usage by map with predict inside.

Reply via email to