Repository: spark Updated Branches: refs/heads/master ea10b3126 -> 98750a74d
http://git-wip-us.apache.org/repos/asf/spark/blob/98750a74/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala index 350130c..be383aa 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala @@ -17,10 +17,8 @@ package org.apache.spark.mllib.tree -import org.scalatest.BeforeAndAfterAll import org.scalatest.FunSuite -import org.apache.spark.SparkContext import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.impurity.{Entropy, Gini, Variance} import org.apache.spark.mllib.tree.model.Filter @@ -28,19 +26,9 @@ import org.apache.spark.mllib.tree.configuration.Strategy import org.apache.spark.mllib.tree.configuration.Algo._ import org.apache.spark.mllib.tree.configuration.FeatureType._ import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.util.LocalSparkContext -class DecisionTreeSuite extends FunSuite with BeforeAndAfterAll { - - @transient private var sc: SparkContext = _ - - override def beforeAll() { - sc = new SparkContext("local", "test") - } - - override def afterAll() { - sc.stop() - System.clearProperty("spark.driver.port") - } +class DecisionTreeSuite extends FunSuite with LocalSparkContext { test("split and bin calculation") { val arr = DecisionTreeSuite.generateOrderedLabeledPointsWithLabel1() http://git-wip-us.apache.org/repos/asf/spark/blob/98750a74/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala index 674378a..3f64baf 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala @@ -19,8 +19,8 @@ package org.apache.spark.mllib.util import java.io.File +import scala.io.Source import scala.math -import scala.util.Random import org.scalatest.FunSuite @@ -29,7 +29,8 @@ import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, norm => breezeNor import com.google.common.base.Charsets import com.google.common.io.Files -import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils._ class MLUtilsSuite extends FunSuite with LocalSparkContext { @@ -58,7 +59,7 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext { } } - test("loadLibSVMData") { + test("loadLibSVMFile") { val lines = """ |+1 1:1.0 3:2.0 5:3.0 @@ -70,8 +71,8 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext { Files.write(lines, file, Charsets.US_ASCII) val path = tempDir.toURI.toString - val pointsWithNumFeatures = MLUtils.loadLibSVMData(sc, path, BinaryLabelParser, 6).collect() - val pointsWithoutNumFeatures = MLUtils.loadLibSVMData(sc, path).collect() + val pointsWithNumFeatures = loadLibSVMFile(sc, path, multiclass = false, 6).collect() + val pointsWithoutNumFeatures = loadLibSVMFile(sc, path).collect() for (points <- Seq(pointsWithNumFeatures, pointsWithoutNumFeatures)) { assert(points.length === 3) @@ -83,29 +84,54 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext { assert(points(2).features === Vectors.sparse(6, Seq((1, 4.0), (3, 5.0), (5, 6.0)))) } - val multiclassPoints = MLUtils.loadLibSVMData(sc, path, MulticlassLabelParser).collect() + val multiclassPoints = loadLibSVMFile(sc, path, multiclass = true).collect() assert(multiclassPoints.length === 3) assert(multiclassPoints(0).label === 1.0) assert(multiclassPoints(1).label === -1.0) assert(multiclassPoints(2).label === -1.0) - try { - file.delete() - tempDir.delete() - } catch { - case t: Throwable => - } + deleteQuietly(tempDir) + } + + test("saveAsLibSVMFile") { + val examples = sc.parallelize(Seq( + LabeledPoint(1.1, Vectors.sparse(3, Seq((0, 1.23), (2, 4.56)))), + LabeledPoint(0.0, Vectors.dense(1.01, 2.02, 3.03)) + ), 2) + val tempDir = Files.createTempDir() + val outputDir = new File(tempDir, "output") + MLUtils.saveAsLibSVMFile(examples, outputDir.toURI.toString) + val lines = outputDir.listFiles() + .filter(_.getName.startsWith("part-")) + .flatMap(Source.fromFile(_).getLines()) + .toSet + val expected = Set("1.1 1:1.23 3:4.56", "0.0 1:1.01 2:2.02 3:3.03") + assert(lines === expected) + deleteQuietly(tempDir) + } + + test("appendBias") { + val sv = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0))) + val sv1 = appendBias(sv).asInstanceOf[SparseVector] + assert(sv1.size === 4) + assert(sv1.indices === Array(0, 2, 3)) + assert(sv1.values === Array(1.0, 3.0, 1.0)) + + val dv = Vectors.dense(1.0, 0.0, 3.0) + val dv1 = appendBias(dv).asInstanceOf[DenseVector] + assert(dv1.size === 4) + assert(dv1.values === Array(1.0, 0.0, 3.0, 1.0)) } test("kFold") { val data = sc.parallelize(1 to 100, 2) val collectedData = data.collect().sorted - val twoFoldedRdd = MLUtils.kFold(data, 2, 1) + val twoFoldedRdd = kFold(data, 2, 1) assert(twoFoldedRdd(0)._1.collect().sorted === twoFoldedRdd(1)._2.collect().sorted) assert(twoFoldedRdd(0)._2.collect().sorted === twoFoldedRdd(1)._1.collect().sorted) for (folds <- 2 to 10) { for (seed <- 1 to 5) { - val foldedRdds = MLUtils.kFold(data, folds, seed) + val foldedRdds = kFold(data, folds, seed) assert(foldedRdds.size === folds) foldedRdds.map { case (training, validation) => val result = validation.union(training).collect().sorted @@ -132,4 +158,16 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext { } } + /** Delete a file/directory quietly. */ + def deleteQuietly(f: File) { + if (f.isDirectory) { + f.listFiles().foreach(deleteQuietly) + } + try { + f.delete() + } catch { + case _: Throwable => + } + } } + http://git-wip-us.apache.org/repos/asf/spark/blob/98750a74/python/pyspark/mllib/classification.py ---------------------------------------------------------------------- diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index c584459..6772e43 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -90,7 +90,7 @@ class SVMModel(LinearModel): >>> svm.predict(array([1.0])) > 0 True >>> sparse_data = [ - ... LabeledPoint(0.0, SparseVector(2, {0: 0.0})), + ... LabeledPoint(0.0, SparseVector(2, {0: -1.0})), ... LabeledPoint(1.0, SparseVector(2, {1: 1.0})), ... LabeledPoint(0.0, SparseVector(2, {0: 0.0})), ... LabeledPoint(1.0, SparseVector(2, {1: 2.0})) @@ -98,7 +98,7 @@ class SVMModel(LinearModel): >>> svm = SVMWithSGD.train(sc.parallelize(sparse_data)) >>> svm.predict(SparseVector(2, {1: 1.0})) > 0 True - >>> svm.predict(SparseVector(2, {1: 0.0})) <= 0 + >>> svm.predict(SparseVector(2, {0: -1.0})) <= 0 True """ def predict(self, x):
