[1/2] [SPARK-1594][MLLIB] Cleaning up MLlib APIs and guide

matei Mon, 05 May 2014 18:33:17 -0700

Repository: spark
Updated Branches:
  refs/heads/master ea10b3126 -> 98750a74d



http://git-wip-us.apache.org/repos/asf/spark/blob/98750a74/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala
index 350130c..be383aa 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala
@@ -17,10 +17,8 @@
 
 package org.apache.spark.mllib.tree
 
-import org.scalatest.BeforeAndAfterAll
 import org.scalatest.FunSuite
 
-import org.apache.spark.SparkContext
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.impurity.{Entropy, Gini, Variance}
 import org.apache.spark.mllib.tree.model.Filter
@@ -28,19 +26,9 @@ import org.apache.spark.mllib.tree.configuration.Strategy
 import org.apache.spark.mllib.tree.configuration.Algo._
 import org.apache.spark.mllib.tree.configuration.FeatureType._
 import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.LocalSparkContext
 
-class DecisionTreeSuite extends FunSuite with BeforeAndAfterAll {
-
-  @transient private var sc: SparkContext = _
-
-  override def beforeAll() {
-    sc = new SparkContext("local", "test")
-  }
-
-  override def afterAll() {
-    sc.stop()
-    System.clearProperty("spark.driver.port")
-  }
+class DecisionTreeSuite extends FunSuite with LocalSparkContext {
 
   test("split and bin calculation") {
     val arr = DecisionTreeSuite.generateOrderedLabeledPointsWithLabel1()

http://git-wip-us.apache.org/repos/asf/spark/blob/98750a74/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
index 674378a..3f64baf 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala
@@ -19,8 +19,8 @@ package org.apache.spark.mllib.util
 
 import java.io.File
 
+import scala.io.Source
 import scala.math
-import scala.util.Random
 
 import org.scalatest.FunSuite
 
@@ -29,7 +29,8 @@ import breeze.linalg.{DenseVector => BDV, SparseVector => 
BSV, norm => breezeNor
 import com.google.common.base.Charsets
 import com.google.common.io.Files
 
-import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
+import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.MLUtils._
 
 class MLUtilsSuite extends FunSuite with LocalSparkContext {
@@ -58,7 +59,7 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext {
     }
   }
 
-  test("loadLibSVMData") {
+  test("loadLibSVMFile") {
     val lines =
       """
         |+1 1:1.0 3:2.0 5:3.0
@@ -70,8 +71,8 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext {
     Files.write(lines, file, Charsets.US_ASCII)
     val path = tempDir.toURI.toString
 
-    val pointsWithNumFeatures = MLUtils.loadLibSVMData(sc, path, 
BinaryLabelParser, 6).collect()
-    val pointsWithoutNumFeatures = MLUtils.loadLibSVMData(sc, path).collect()
+    val pointsWithNumFeatures = loadLibSVMFile(sc, path, multiclass = false, 
6).collect()
+    val pointsWithoutNumFeatures = loadLibSVMFile(sc, path).collect()
 
     for (points <- Seq(pointsWithNumFeatures, pointsWithoutNumFeatures)) {
       assert(points.length === 3)
@@ -83,29 +84,54 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext {
       assert(points(2).features === Vectors.sparse(6, Seq((1, 4.0), (3, 5.0), 
(5, 6.0))))
     }
 
-    val multiclassPoints = MLUtils.loadLibSVMData(sc, path, 
MulticlassLabelParser).collect()
+    val multiclassPoints = loadLibSVMFile(sc, path, multiclass = 
true).collect()
     assert(multiclassPoints.length === 3)
     assert(multiclassPoints(0).label === 1.0)
     assert(multiclassPoints(1).label === -1.0)
     assert(multiclassPoints(2).label === -1.0)
 
-    try {
-      file.delete()
-      tempDir.delete()
-    } catch {
-      case t: Throwable =>
-    }
+    deleteQuietly(tempDir)
+  }
+
+  test("saveAsLibSVMFile") {
+    val examples = sc.parallelize(Seq(
+      LabeledPoint(1.1, Vectors.sparse(3, Seq((0, 1.23), (2, 4.56)))),
+      LabeledPoint(0.0, Vectors.dense(1.01, 2.02, 3.03))
+    ), 2)
+    val tempDir = Files.createTempDir()
+    val outputDir = new File(tempDir, "output")
+    MLUtils.saveAsLibSVMFile(examples, outputDir.toURI.toString)
+    val lines = outputDir.listFiles()
+      .filter(_.getName.startsWith("part-"))
+      .flatMap(Source.fromFile(_).getLines())
+      .toSet
+    val expected = Set("1.1 1:1.23 3:4.56", "0.0 1:1.01 2:2.02 3:3.03")
+    assert(lines === expected)
+    deleteQuietly(tempDir)
+  }
+
+  test("appendBias") {
+    val sv = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0)))
+    val sv1 = appendBias(sv).asInstanceOf[SparseVector]
+    assert(sv1.size === 4)
+    assert(sv1.indices === Array(0, 2, 3))
+    assert(sv1.values === Array(1.0, 3.0, 1.0))
+
+    val dv = Vectors.dense(1.0, 0.0, 3.0)
+    val dv1 = appendBias(dv).asInstanceOf[DenseVector]
+    assert(dv1.size === 4)
+    assert(dv1.values === Array(1.0, 0.0, 3.0, 1.0))
   }
 
   test("kFold") {
     val data = sc.parallelize(1 to 100, 2)
     val collectedData = data.collect().sorted
-    val twoFoldedRdd = MLUtils.kFold(data, 2, 1)
+    val twoFoldedRdd = kFold(data, 2, 1)
     assert(twoFoldedRdd(0)._1.collect().sorted === 
twoFoldedRdd(1)._2.collect().sorted)
     assert(twoFoldedRdd(0)._2.collect().sorted === 
twoFoldedRdd(1)._1.collect().sorted)
     for (folds <- 2 to 10) {
       for (seed <- 1 to 5) {
-        val foldedRdds = MLUtils.kFold(data, folds, seed)
+        val foldedRdds = kFold(data, folds, seed)
         assert(foldedRdds.size === folds)
         foldedRdds.map { case (training, validation) =>
           val result = validation.union(training).collect().sorted
@@ -132,4 +158,16 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext 
{
     }
   }
 
+  /** Delete a file/directory quietly. */
+  def deleteQuietly(f: File) {
+    if (f.isDirectory) {
+      f.listFiles().foreach(deleteQuietly)
+    }
+    try {
+      f.delete()
+    } catch {
+      case _: Throwable =>
+    }
+  }
 }
+

http://git-wip-us.apache.org/repos/asf/spark/blob/98750a74/python/pyspark/mllib/classification.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/classification.py 
b/python/pyspark/mllib/classification.py
index c584459..6772e43 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -90,7 +90,7 @@ class SVMModel(LinearModel):
     >>> svm.predict(array([1.0])) > 0
     True
     >>> sparse_data = [
-    ...     LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
+    ...     LabeledPoint(0.0, SparseVector(2, {0: -1.0})),
     ...     LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
     ...     LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
     ...     LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
@@ -98,7 +98,7 @@ class SVMModel(LinearModel):
     >>> svm = SVMWithSGD.train(sc.parallelize(sparse_data))
     >>> svm.predict(SparseVector(2, {1: 1.0})) > 0
     True
-    >>> svm.predict(SparseVector(2, {1: 0.0})) <= 0
+    >>> svm.predict(SparseVector(2, {0: -1.0})) <= 0
     True
     """
     def predict(self, x):

[1/2] [SPARK-1594][MLLIB] Cleaning up MLlib APIs and guide

Reply via email to