http://git-wip-us.apache.org/repos/asf/spark/blob/26145a5a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index a1b4853..2623759 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -22,28 +22,49 @@ import scala.language.existentials import scala.util.Random import scala.util.control.Breaks._ -import org.apache.spark.SparkFunSuite +import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.classification.LogisticRegressionSuite._ -import org.apache.spark.ml.feature.{Instance, LabeledPoint} -import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.feature.LabeledPoint +import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, SparseMatrix, SparseVector, Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions.lit class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { - @transient var dataset: Dataset[_] = _ - @transient var binaryDataset: DataFrame = _ + @transient var smallBinaryDataset: Dataset[_] = _ + @transient var smallMultinomialDataset: Dataset[_] = _ + @transient var binaryDataset: Dataset[_] = _ + @transient var multinomialDataset: Dataset[_] = _ private val eps: 
Double = 1e-5 override def beforeAll(): Unit = { super.beforeAll() - dataset = spark.createDataFrame(generateLogisticInput(1.0, 1.0, nPoints = 100, seed = 42)) + smallBinaryDataset = + spark.createDataFrame(generateLogisticInput(1.0, 1.0, nPoints = 100, seed = 42)) + + smallMultinomialDataset = { + val nPoints = 100 + val coefficients = Array( + -0.57997, 0.912083, -0.371077, + -0.16624, -0.84355, -0.048509) + + val xMean = Array(5.843, 3.057) + val xVariance = Array(0.6856, 0.1899) + + val testData = generateMultinomialLogisticInput( + coefficients, xMean, xVariance, addIntercept = true, nPoints, 42) + + val df = spark.createDataFrame(sc.parallelize(testData, 4)) + df.cache() + df + } binaryDataset = { val nPoints = 10000 @@ -57,6 +78,23 @@ class LogisticRegressionSuite spark.createDataFrame(sc.parallelize(testData, 4)) } + + multinomialDataset = { + val nPoints = 10000 + val coefficients = Array( + -0.57997, 0.912083, -0.371077, -0.819866, 2.688191, + -0.16624, -0.84355, -0.048509, -0.301789, 4.170682) + + val xMean = Array(5.843, 3.057, 3.758, 1.199) + val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) + + val testData = generateMultinomialLogisticInput( + coefficients, xMean, xVariance, addIntercept = true, nPoints, 42) + + val df = spark.createDataFrame(sc.parallelize(testData, 4)) + df.cache() + df + } } /** @@ -67,6 +105,9 @@ class LogisticRegressionSuite binaryDataset.rdd.map { case Row(label: Double, features: Vector) => label + "," + features.toArray.mkString(",") }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/binaryDataset") + multinomialDataset.rdd.map { case Row(label: Double, features: Vector) => + label + "," + features.toArray.mkString(",") + }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/multinomialDataset") } test("params") { @@ -82,11 +123,12 @@ class LogisticRegressionSuite assert(lr.getPredictionCol === "prediction") assert(lr.getRawPredictionCol === "rawPrediction") assert(lr.getProbabilityCol === 
"probability") + assert(lr.getFamily === "auto") assert(!lr.isDefined(lr.weightCol)) assert(lr.getFitIntercept) assert(lr.getStandardization) - val model = lr.fit(dataset) - model.transform(dataset) + val model = lr.fit(smallBinaryDataset) + model.transform(smallBinaryDataset) .select("label", "probability", "prediction", "rawPrediction") .collect() assert(model.getThreshold === 0.5) @@ -100,17 +142,17 @@ class LogisticRegressionSuite test("empty probabilityCol") { val lr = new LogisticRegression().setProbabilityCol("") - val model = lr.fit(dataset) + val model = lr.fit(smallBinaryDataset) assert(model.hasSummary) // Validate that we re-insert a probability column for evaluation val fieldNames = model.summary.predictions.schema.fieldNames - assert(dataset.schema.fieldNames.toSet.subsetOf( + assert(smallBinaryDataset.schema.fieldNames.toSet.subsetOf( fieldNames.toSet)) assert(fieldNames.exists(s => s.startsWith("probability_"))) } test("setThreshold, getThreshold") { - val lr = new LogisticRegression + val lr = new LogisticRegression().setFamily("binomial") // default assert(lr.getThreshold === 0.5, "LogisticRegression.threshold should default to 0.5") withClue("LogisticRegression should not have thresholds set by default.") { @@ -127,7 +169,7 @@ class LogisticRegressionSuite lr.setThreshold(0.5) assert(lr.getThresholds === Array(0.5, 0.5)) // Set via thresholds - val lr2 = new LogisticRegression + val lr2 = new LogisticRegression().setFamily("binomial") lr2.setThresholds(Array(0.3, 0.7)) val expectedThreshold = 1.0 / (1.0 + 0.3 / 0.7) assert(lr2.getThreshold ~== expectedThreshold relTol 1E-7) @@ -141,19 +183,72 @@ class LogisticRegressionSuite // thresholds and threshold must be consistent: values withClue("fit with ParamMap should throw error if threshold, thresholds do not match.") { intercept[IllegalArgumentException] { - val lr2model = lr2.fit(dataset, + val lr2model = lr2.fit(smallBinaryDataset, lr2.thresholds -> Array(0.3, 0.7), lr2.threshold -> 
(expectedThreshold / 2.0)) lr2model.getThreshold } } } + test("thresholds prediction") { + val blr = new LogisticRegression().setFamily("binomial") + val binaryModel = blr.fit(smallBinaryDataset) + + binaryModel.setThreshold(1.0) + val binaryZeroPredictions = + binaryModel.transform(smallBinaryDataset).select("prediction").collect() + assert(binaryZeroPredictions.forall(_.getDouble(0) === 0.0)) + + binaryModel.setThreshold(0.0) + val binaryOnePredictions = + binaryModel.transform(smallBinaryDataset).select("prediction").collect() + assert(binaryOnePredictions.forall(_.getDouble(0) === 1.0)) + + + val mlr = new LogisticRegression().setFamily("multinomial") + val model = mlr.fit(smallMultinomialDataset) + val basePredictions = model.transform(smallMultinomialDataset).select("prediction").collect() + + // should predict all zeros + model.setThresholds(Array(1, 1000, 1000)) + val zeroPredictions = model.transform(smallMultinomialDataset).select("prediction").collect() + assert(zeroPredictions.forall(_.getDouble(0) === 0.0)) + + // should predict all ones + model.setThresholds(Array(1000, 1, 1000)) + val onePredictions = model.transform(smallMultinomialDataset).select("prediction").collect() + assert(onePredictions.forall(_.getDouble(0) === 1.0)) + + // should predict all twos + model.setThresholds(Array(1000, 1000, 1)) + val twoPredictions = model.transform(smallMultinomialDataset).select("prediction").collect() + assert(twoPredictions.forall(_.getDouble(0) === 2.0)) + + // constant threshold scaling is the same as no thresholds + model.setThresholds(Array(1000, 1000, 1000)) + val scaledPredictions = model.transform(smallMultinomialDataset).select("prediction").collect() + assert(scaledPredictions.zip(basePredictions).forall { case (scaled, base) => + scaled.getDouble(0) === base.getDouble(0) + }) + + // force it to use the predict method + model.setRawPredictionCol("").setProbabilityCol("").setThresholds(Array(0, 1, 1)) + val predictionsWithPredict = + 
model.transform(smallMultinomialDataset).select("prediction").collect() + assert(predictionsWithPredict.forall(_.getDouble(0) === 0.0)) + } + test("logistic regression doesn't fit intercept when fitIntercept is off") { - val lr = new LogisticRegression + val lr = new LogisticRegression().setFamily("binomial") lr.setFitIntercept(false) - val model = lr.fit(dataset) + val model = lr.fit(smallBinaryDataset) assert(model.intercept === 0.0) + val mlr = new LogisticRegression().setFamily("multinomial") + mlr.setFitIntercept(false) + val mlrModel = mlr.fit(smallMultinomialDataset) + assert(mlrModel.interceptVector === Vectors.sparse(3, Seq())) + // copied model must have the same parent. MLTestingUtils.checkCopy(model) } @@ -165,7 +260,7 @@ class LogisticRegressionSuite .setRegParam(1.0) .setThreshold(0.6) .setProbabilityCol("myProbability") - val model = lr.fit(dataset) + val model = lr.fit(smallBinaryDataset) val parent = model.parent.asInstanceOf[LogisticRegression] assert(parent.getMaxIter === 10) assert(parent.getRegParam === 1.0) @@ -174,16 +269,16 @@ class LogisticRegressionSuite // Modify model params, and check that the params worked. model.setThreshold(1.0) - val predAllZero = model.transform(dataset) + val predAllZero = model.transform(smallBinaryDataset) .select("prediction", "myProbability") .collect() .map { case Row(pred: Double, prob: Vector) => pred } assert(predAllZero.forall(_ === 0), s"With threshold=1.0, expected predictions to be all 0, but only" + - s" ${predAllZero.count(_ === 0)} of ${dataset.count()} were 0.") + s" ${predAllZero.count(_ === 0)} of ${smallBinaryDataset.count()} were 0.") // Call transform with params, and check that the params worked. 
val predNotAllZero = - model.transform(dataset, model.threshold -> 0.0, + model.transform(smallBinaryDataset, model.threshold -> 0.0, model.probabilityCol -> "myProb") .select("prediction", "myProb") .collect() @@ -192,7 +287,7 @@ class LogisticRegressionSuite // Call fit() with new params, and check as many params as we can. lr.setThresholds(Array(0.6, 0.4)) - val model2 = lr.fit(dataset, lr.maxIter -> 5, lr.regParam -> 0.1, + val model2 = lr.fit(smallBinaryDataset, lr.maxIter -> 5, lr.regParam -> 0.1, lr.probabilityCol -> "theProb") val parent2 = model2.parent.asInstanceOf[LogisticRegression] assert(parent2.getMaxIter === 5) @@ -202,17 +297,82 @@ class LogisticRegressionSuite assert(model2.getProbabilityCol === "theProb") } - test("logistic regression: Predictor, Classifier methods") { - val spark = this.spark - val lr = new LogisticRegression + test("multinomial logistic regression: Predictor, Classifier methods") { + val sqlContext = smallMultinomialDataset.sqlContext + import sqlContext.implicits._ + val mlr = new LogisticRegression().setFamily("multinomial") + + val model = mlr.fit(smallMultinomialDataset) + assert(model.numClasses === 3) + val numFeatures = smallMultinomialDataset.select("features").first().getAs[Vector](0).size + assert(model.numFeatures === numFeatures) + + val results = model.transform(smallMultinomialDataset) + // check that raw prediction is coefficients dot features + intercept + results.select("rawPrediction", "features").collect().foreach { + case Row(raw: Vector, features: Vector) => + assert(raw.size === 3) + val margins = Array.tabulate(3) { k => + var margin = 0.0 + features.foreachActive { (index, value) => + margin += value * model.coefficientMatrix(k, index) + } + margin += model.interceptVector(k) + margin + } + assert(raw ~== Vectors.dense(margins) relTol eps) + } + + // Compare rawPrediction with probability + results.select("rawPrediction", "probability").collect().foreach { + case Row(raw: Vector, prob: Vector) => + 
assert(raw.size === 3) + assert(prob.size === 3) + val max = raw.toArray.max + val subtract = if (max > 0) max else 0.0 + val sum = raw.toArray.map(x => math.exp(x - subtract)).sum + val probFromRaw0 = math.exp(raw(0) - subtract) / sum + val probFromRaw1 = math.exp(raw(1) - subtract) / sum + assert(prob(0) ~== probFromRaw0 relTol eps) + assert(prob(1) ~== probFromRaw1 relTol eps) + assert(prob(2) ~== 1.0 - probFromRaw1 - probFromRaw0 relTol eps) + } + + // Compare prediction with probability + results.select("prediction", "probability").collect().foreach { + case Row(pred: Double, prob: Vector) => + val predFromProb = prob.toArray.zipWithIndex.maxBy(_._1)._2 + assert(pred == predFromProb) + } + + // force it to use probability2prediction + model.setProbabilityCol("") + val resultsUsingProb2Predict = + model.transform(smallMultinomialDataset).select("prediction").as[Double].collect() + resultsUsingProb2Predict.zip(results.select("prediction").as[Double].collect()).foreach { + case (pred1, pred2) => assert(pred1 === pred2) + } + + // force it to use predict + model.setRawPredictionCol("").setProbabilityCol("") + val resultsUsingPredict = + model.transform(smallMultinomialDataset).select("prediction").as[Double].collect() + resultsUsingPredict.zip(results.select("prediction").as[Double].collect()).foreach { + case (pred1, pred2) => assert(pred1 === pred2) + } + } + + test("binary logistic regression: Predictor, Classifier methods") { + val sqlContext = smallBinaryDataset.sqlContext + import sqlContext.implicits._ + val lr = new LogisticRegression().setFamily("binomial") - val model = lr.fit(dataset) + val model = lr.fit(smallBinaryDataset) assert(model.numClasses === 2) - val numFeatures = dataset.select("features").first().getAs[Vector](0).size + val numFeatures = smallBinaryDataset.select("features").first().getAs[Vector](0).size assert(model.numFeatures === numFeatures) - val threshold = model.getThreshold - val results = model.transform(dataset) + val results = 
model.transform(smallBinaryDataset) // Compare rawPrediction with probability results.select("rawPrediction", "probability").collect().foreach { @@ -230,6 +390,63 @@ class LogisticRegressionSuite val predFromProb = prob.toArray.zipWithIndex.maxBy(_._1)._2 assert(pred == predFromProb) } + + // force it to use probability2prediction + model.setProbabilityCol("") + val resultsUsingProb2Predict = + model.transform(smallBinaryDataset).select("prediction").as[Double].collect() + resultsUsingProb2Predict.zip(results.select("prediction").as[Double].collect()).foreach { + case (pred1, pred2) => assert(pred1 === pred2) + } + + // force it to use predict + model.setRawPredictionCol("").setProbabilityCol("") + val resultsUsingPredict = + model.transform(smallBinaryDataset).select("prediction").as[Double].collect() + resultsUsingPredict.zip(results.select("prediction").as[Double].collect()).foreach { + case (pred1, pred2) => assert(pred1 === pred2) + } + } + + test("coefficients and intercept methods") { + val mlr = new LogisticRegression().setMaxIter(1).setFamily("multinomial") + val mlrModel = mlr.fit(smallMultinomialDataset) + val thrownCoef = intercept[SparkException] { + mlrModel.coefficients + } + val thrownIntercept = intercept[SparkException] { + mlrModel.intercept + } + assert(thrownCoef.getMessage().contains("use coefficientMatrix instead")) + assert(thrownIntercept.getMessage().contains("use interceptVector instead")) + + val blr = new LogisticRegression().setMaxIter(1).setFamily("binomial") + val blrModel = blr.fit(smallBinaryDataset) + assert(blrModel.coefficients.size === 1) + assert(blrModel.intercept !== 0.0) + } + + test("overflow prediction for multiclass") { + val model = new LogisticRegressionModel("mLogReg", + Matrices.dense(3, 2, Array(0.0, 0.0, 0.0, 1.0, 2.0, 3.0)), + Vectors.dense(0.0, 0.0, 0.0), 3, true) + val overFlowData = spark.createDataFrame(Seq( + LabeledPoint(1.0, Vectors.dense(0.0, 1000.0)), + LabeledPoint(1.0, Vectors.dense(0.0, -1.0)) + )) + 
val results = model.transform(overFlowData).select("rawPrediction", "probability").collect() + + // probabilities are correct when margins have to be adjusted + val raw1 = results(0).getAs[Vector](0) + val prob1 = results(0).getAs[Vector](1) + assert(raw1 === Vectors.dense(1000.0, 2000.0, 3000.0)) + assert(prob1 ~== Vectors.dense(0.0, 0.0, 1.0) absTol eps) + + // probabilities are correct when margins don't have to be adjusted + val raw2 = results(1).getAs[Vector](0) + val prob2 = results(1).getAs[Vector](1) + assert(raw2 === Vectors.dense(-1.0, -2.0, -3.0)) + assert(prob2 ~== Vectors.dense(0.66524096, 0.24472847, 0.09003057) relTol eps) } test("MultiClassSummarizer") { @@ -427,7 +644,9 @@ class LogisticRegressionSuite val coefficientsR2 = Vectors.dense(0.0, 0.0, -0.1665453, 0.0) assert(model2.intercept ~== interceptR2 relTol 1E-2) - assert(model2.coefficients ~= coefficientsR2 absTol 1E-3) + assert(model2.coefficients ~== coefficientsR2 absTol 1E-3) + // TODO: move this to a standalone test of compression after SPARK-17471 + assert(model2.coefficients.isInstanceOf[SparseVector]) } test("binary logistic regression without intercept with L1 regularization") { @@ -768,6 +987,7 @@ class LogisticRegressionSuite assert(model2.coefficients ~= coefficientsTheory absTol 1E-6) /* + TODO: why is this needed? The correctness of L1 regularization is already checked elsewhere Using the following R code to load the data and train the model using glmnet package. 
library("glmnet") @@ -792,16 +1012,759 @@ class LogisticRegressionSuite assert(model1.coefficients ~== coefficientsR absTol 1E-6) } + test("multinomial logistic regression with intercept with strong L1 regularization") { + val trainer1 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(1.0).setRegParam(6.0).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(1.0).setRegParam(6.0).setStandardization(false) + + val sqlContext = multinomialDataset.sqlContext + import sqlContext.implicits._ + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + + val histogram = multinomialDataset.as[LabeledPoint].rdd.map(_.label) + .treeAggregate(new MultiClassSummarizer)( + seqOp = (c, v) => (c, v) match { + case (classSummarizer: MultiClassSummarizer, label: Double) => classSummarizer.add(label) + }, + combOp = (c1, c2) => (c1, c2) match { + case (classSummarizer1: MultiClassSummarizer, classSummarizer2: MultiClassSummarizer) => + classSummarizer1.merge(classSummarizer2) + }).histogram + val numFeatures = multinomialDataset.as[LabeledPoint].first().features.size + val numClasses = histogram.length + + /* + For multinomial logistic regression with strong L1 regularization, all the coefficients + will be zeros. As a result, the intercepts will equal the log counts in the + histogram up to an additive constant. + {{{ + \exp(b_k) = count_k * \exp(\lambda) + b_k = \log(count_k) + \lambda + }}} + \lambda is a free parameter, so choose the phase \lambda such that the + mean is centered. 
This yields + {{{ + b_k = \log(count_k) + b_k' = b_k - \mean(b_k) + }}} + */ + val rawInterceptsTheory = histogram.map(c => math.log(c + 1)) // add 1 for smoothing + val rawMean = rawInterceptsTheory.sum / rawInterceptsTheory.length + val interceptsTheory = Vectors.dense(rawInterceptsTheory.map(_ - rawMean)) + val coefficientsTheory = new DenseMatrix(numClasses, numFeatures, + Array.fill[Double](numClasses * numFeatures)(0.0), isTransposed = true) + + assert(model1.interceptVector ~== interceptsTheory relTol 1E-3) + assert(model1.coefficientMatrix ~= coefficientsTheory absTol 1E-6) + + assert(model2.interceptVector ~== interceptsTheory relTol 1E-3) + assert(model2.coefficientMatrix ~= coefficientsTheory absTol 1E-6) + } + + test("multinomial logistic regression with intercept without regularization") { + + val trainer1 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true).setMaxIter(100) + val trainer2 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + + /* + Using the following R code to load the data and train the model using glmnet package. 
+ > library("glmnet") + > data <- read.csv("path", header=FALSE) + > label = as.factor(data$V1) + > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + > coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, lambda = 0)) + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -2.24493379 + V2 0.25096771 + V3 -0.03915938 + V4 0.14766639 + V5 0.36810817 + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.3778931 + V2 -0.3327489 + V3 0.8893666 + V4 -0.2306948 + V5 -0.4442330 + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 1.86704066 + V2 0.08178121 + V3 -0.85020722 + V4 0.08302840 + V5 0.07612480 + */ + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.2509677, -0.0391594, 0.1476664, 0.3681082, + -0.3327489, 0.8893666, -0.2306948, -0.4442330, + 0.0817812, -0.8502072, 0.0830284, 0.0761248), isTransposed = true) + val interceptsR = Vectors.dense(-2.2449338, 0.3778931, 1.8670407) + + assert(model1.coefficientMatrix ~== coefficientsR relTol 0.05) + assert(model1.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) + assert(model1.interceptVector ~== interceptsR relTol 0.05) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05) + assert(model2.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) + assert(model2.interceptVector ~== interceptsR relTol 0.05) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) + } + + test("multinomial logistic regression without intercept without regularization") { + + val trainer1 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + + /* + Using the following R code to load the data and 
train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, lambda = 0, + intercept=F)) + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 0.06992464 + V3 -0.36562784 + V4 0.12142680 + V5 0.32052211 + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 -0.3036269 + V3 0.9449630 + V4 -0.2271038 + V5 -0.4364839 + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 0.2337022 + V3 -0.5793351 + V4 0.1056770 + V5 0.1159618 + */ + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.0699246, -0.3656278, 0.1214268, 0.3205221, + -0.3036269, 0.9449630, -0.2271038, -0.4364839, + 0.2337022, -0.5793351, 0.1056770, 0.1159618), isTransposed = true) + + assert(model1.coefficientMatrix ~== coefficientsR relTol 0.05) + assert(model1.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) + assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05) + assert(model2.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) + assert(model2.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) + } + + test("multinomial logistic regression with intercept with L1 regularization") { + + // use tighter constraints because OWL-QN solver takes longer to converge + val trainer1 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true) + .setMaxIter(300).setTol(1e-10) + val trainer2 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false) + .setMaxIter(300).setTol(1e-10) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = 
trainer2.fit(multinomialDataset) + + /* + Use the following R code to load the data and train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 1, + lambda = 0.05, standardization=T)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 1, lambda = 0.05, + standardization=F)) + > coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.68988825 + V2 . + V3 . + V4 . + V5 0.09404023 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.2303499 + V2 -0.1232443 + V3 0.3258380 + V4 -0.1564688 + V5 -0.2053965 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.9202381 + V2 . + V3 -0.4803856 + V4 . + V5 . + + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.44893320 + V2 . + V3 . + V4 0.01933812 + V5 0.03666044 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.7376760 + V2 -0.0577182 + V3 . + V4 -0.2081718 + V5 -0.1304592 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.2887428 + V2 . + V3 . + V4 . + V5 . 
+ */ + + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0, 0.09404023, + -0.1232443, 0.3258380, -0.1564688, -0.2053965, + 0.0, -0.4803856, 0.0, 0.0), isTransposed = true) + val interceptsRStd = Vectors.dense(-0.68988825, -0.2303499, 0.9202381) + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.01933812, 0.03666044, + -0.0577182, 0.0, -0.2081718, -0.1304592, + 0.0, 0.0, 0.0, 0.0), isTransposed = true) + val interceptsR = Vectors.dense(-0.44893320, 0.7376760, -0.2887428) + + assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.02) + assert(model1.interceptVector ~== interceptsRStd relTol 0.1) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR absTol 0.02) + assert(model2.interceptVector ~== interceptsR relTol 0.1) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) + } + + test("multinomial logistic regression without intercept with L1 regularization") { + val trainer1 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + /* + Use the following R code to load the data and train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 1, + lambda = 0.05, intercept=F, standardization=T)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 1, lambda = 0.05, + intercept=F, standardization=F)) + > coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 . + V4 . 
+ V5 0.01525105 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 -0.1502410 + V3 0.5134658 + V4 -0.1601146 + V5 -0.2500232 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 0.003301875 + V3 . + V4 . + V5 . + + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 . + V4 . + V5 . + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 0.1943624 + V4 -0.1902577 + V5 -0.1028789 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 . + V4 . + V5 . + */ + + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0, 0.01525105, + -0.1502410, 0.5134658, -0.1601146, -0.2500232, + 0.003301875, 0.0, 0.0, 0.0), isTransposed = true) + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.1943624, -0.1902577, -0.1028789, + 0.0, 0.0, 0.0, 0.0), isTransposed = true) + + assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) + assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01) + assert(model2.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) + } + + test("multinomial logistic regression with intercept with L2 regularization") { + val trainer1 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + /* + Use the following R code to load the data and train the model using glmnet package. 
+ library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0, + lambda = 0.1, intercept=T, standardization=T)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, + lambda = 0.1, intercept=T, standardization=F)) + > coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -1.70040424 + V2 0.17576070 + V3 0.01527894 + V4 0.10216108 + V5 0.26099531 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.2438590 + V2 -0.2238875 + V3 0.5967610 + V4 -0.1555496 + V5 -0.3010479 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 1.45654525 + V2 0.04812679 + V3 -0.61203992 + V4 0.05338850 + V5 0.04005258 + + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -1.65488543 + V2 0.15715048 + V3 0.01992903 + V4 0.12428858 + V5 0.22130317 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 1.1297533 + V2 -0.1974768 + V3 0.2776373 + V4 -0.1869445 + V5 -0.2510320 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.52513212 + V2 0.04032627 + V3 -0.29756637 + V4 0.06265594 + V5 0.02972883 + */ + + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.17576070, 0.01527894, 0.10216108, 0.26099531, + -0.2238875, 0.5967610, -0.1555496, -0.3010479, + 0.04812679, -0.61203992, 0.05338850, 0.04005258), isTransposed = true) + val interceptsRStd = Vectors.dense(-1.70040424, 0.2438590, 1.45654525) + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.15715048, 0.01992903, 0.12428858, 0.22130317, + -0.1974768, 0.2776373, -0.1869445, -0.2510320, + 0.04032627, -0.29756637, 0.06265594, 0.02972883), isTransposed = true) + val interceptsR = Vectors.dense(-1.65488543, 1.1297533, 0.52513212) + + assert(model1.coefficientMatrix ~== coefficientsRStd relTol 0.05) + assert(model1.interceptVector ~== interceptsRStd relTol 0.05) + 
assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05) + assert(model2.interceptVector ~== interceptsR relTol 0.05) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) + } + + test("multinomial logistic regression without intercept with L2 regularization") { + val trainer1 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + /* + Use the following R code to load the data and train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0, + lambda = 0.1, intercept=F, standardization=T)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, + lambda = 0.1, intercept=F, standardization=F)) + > coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 0.03904171 + V3 -0.23354322 + V4 0.08288096 + V5 0.22706393 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 -0.2061848 + V3 0.6341398 + V4 -0.1530059 + V5 -0.2958455 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 0.16714312 + V3 -0.40059658 + V4 0.07012496 + V5 0.06878158 + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 -0.005704542 + V3 -0.144466409 + V4 0.092080736 + V5 0.182927657 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 -0.08469036 + V3 0.38996748 + V4 -0.16468436 + V5 -0.22522976 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . 
+ V2 0.09039490 + V3 -0.24550107 + V4 0.07260362 + V5 0.04230210 + */ + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.03904171, -0.23354322, 0.08288096, 0.2270639, + -0.2061848, 0.6341398, -0.1530059, -0.2958455, + 0.16714312, -0.40059658, 0.07012496, 0.06878158), isTransposed = true) + + val coefficientsR = new DenseMatrix(3, 4, Array( + -0.005704542, -0.144466409, 0.092080736, 0.182927657, + -0.08469036, 0.38996748, -0.16468436, -0.22522976, + 0.0903949, -0.24550107, 0.07260362, 0.0423021), isTransposed = true) + + assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) + assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01) + assert(model2.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) + } + + test("multinomial logistic regression with intercept with elasticnet regularization") { + val trainer1 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true) + .setMaxIter(300).setTol(1e-10) + val trainer2 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false) + .setMaxIter(300).setTol(1e-10) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + /* + Use the following R code to load the data and train the model using glmnet package. 
+ library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0.5, + lambda = 0.1, intercept=T, standardization=T)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0.5, + lambda = 0.1, intercept=T, standardization=F)) + > coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.5521819483 + V2 0.0003092611 + V3 . + V4 . + V5 0.0913818490 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.27531989 + V2 -0.09790029 + V3 0.28502034 + V4 -0.12416487 + V5 -0.16513373 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.8275018 + V2 . + V3 -0.4044859 + V4 . + V5 . + + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.39876213 + V2 . + V3 . + V4 0.02547520 + V5 0.03893991 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.61089869 + V2 -0.04224269 + V3 . + V4 -0.18923970 + V5 -0.09104249 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.2121366 + V2 . + V3 . + V4 . + V5 . 
+ */ + + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.0003092611, 0.0, 0.0, 0.091381849, + -0.09790029, 0.28502034, -0.12416487, -0.16513373, + 0.0, -0.4044859, 0.0, 0.0), isTransposed = true) + val interceptsRStd = Vectors.dense(-0.5521819483, -0.27531989, 0.8275018) + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0254752, 0.03893991, + -0.04224269, 0.0, -0.1892397, -0.09104249, + 0.0, 0.0, 0.0, 0.0), isTransposed = true) + val interceptsR = Vectors.dense(-0.39876213, 0.61089869, -0.2121366) + + assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) + assert(model1.interceptVector ~== interceptsRStd absTol 0.01) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01) + assert(model2.interceptVector ~== interceptsR absTol 0.01) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) + } + + test("multinomial logistic regression without intercept with elasticnet regularization") { + val trainer1 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true) + .setMaxIter(300).setTol(1e-10) + val trainer2 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false) + .setMaxIter(300).setTol(1e-10) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + /* + Use the following R code to load the data and train the model using glmnet package. 
+ library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0.5, + lambda = 0.1, intercept=F, standardization=T)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0.5, + lambda = 0.1, intercept=F, standardization=F)) + > coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 . + V4 . + V5 0.03543706 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 -0.1187387 + V3 0.4025482 + V4 -0.1270969 + V5 -0.1918386 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 0.00774365 + V3 . + V4 . + V5 . + + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 . + V4 . + V5 . + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 0.14666497 + V4 -0.16570638 + V5 -0.05982875 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 . + V4 . + V5 . 
+ */ + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0, 0.03543706, + -0.1187387, 0.4025482, -0.1270969, -0.1918386, + 0.0, 0.0, 0.0, 0.00774365), isTransposed = true) + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.14666497, -0.16570638, -0.05982875, + 0.0, 0.0, 0.0, 0.0), isTransposed = true) + + assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) + assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01) + assert(model2.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) + } + test("evaluate on test set") { + // TODO: add for multiclass when model summary becomes available // Evaluate on test set should be same as that of the transformed training data. val lr = new LogisticRegression() .setMaxIter(10) .setRegParam(1.0) .setThreshold(0.6) - val model = lr.fit(dataset) + val model = lr.fit(smallBinaryDataset) val summary = model.summary.asInstanceOf[BinaryLogisticRegressionSummary] - val sameSummary = model.evaluate(dataset).asInstanceOf[BinaryLogisticRegressionSummary] + val sameSummary = + model.evaluate(smallBinaryDataset).asInstanceOf[BinaryLogisticRegressionSummary] assert(summary.areaUnderROC === sameSummary.areaUnderROC) assert(summary.roc.collect() === sameSummary.roc.collect()) assert(summary.pr.collect === sameSummary.pr.collect()) @@ -818,7 +1781,7 @@ class LogisticRegressionSuite .setMaxIter(10) .setRegParam(1.0) .setThreshold(0.6) - val model = lr.fit(dataset) + val model = lr.fit(smallBinaryDataset) assert( model.summary .objectiveHistory @@ -827,67 +1790,118 @@ class LogisticRegressionSuite } - test("binary logistic regression with weighted samples") { - val (dataset, weightedDataset) = { - val nPoints = 1000 - val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191) - 
val xMean = Array(5.843, 3.057, 3.758, 1.199) - val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) - val testData = - generateMultinomialLogisticInput(coefficients, xMean, xVariance, true, nPoints, 42) - - // Let's over-sample the positive samples twice. - val data1 = testData.flatMap { case labeledPoint: LabeledPoint => - if (labeledPoint.label == 1.0) { - Iterator(labeledPoint, labeledPoint) - } else { - Iterator(labeledPoint) - } - } + test("binary logistic regression with weighted data") { + val numClasses = 2 + val numPoints = 40 + val outlierData = MLTestingUtils.genClassificationInstancesWithWeightedOutliers(spark, + numClasses, numPoints) + val testData = spark.createDataFrame(Array.tabulate[LabeledPoint](numClasses) { i => + LabeledPoint(i.toDouble, Vectors.dense(i.toDouble)) + }) + val lr = new LogisticRegression().setFamily("binomial").setWeightCol("weight") + val model = lr.fit(outlierData) + val results = model.transform(testData).select("label", "prediction").collect() + + // check that the predictions are the one to one mapping + results.foreach { case Row(label: Double, pred: Double) => + assert(label === pred) + } + val (overSampledData, weightedData) = + MLTestingUtils.genEquivalentOversampledAndWeightedInstances(outlierData, "label", "features", + 42L) + val weightedModel = lr.fit(weightedData) + val overSampledModel = lr.setWeightCol("").fit(overSampledData) + assert(weightedModel.coefficientMatrix ~== overSampledModel.coefficientMatrix relTol 0.01) + } - val rnd = new Random(8392) - val data2 = testData.flatMap { case LabeledPoint(label: Double, features: Vector) => - if (rnd.nextGaussian() > 0.0) { - if (label == 1.0) { - Iterator( - Instance(label, 1.2, features), - Instance(label, 0.8, features), - Instance(0.0, 0.0, features)) - } else { - Iterator( - Instance(label, 0.3, features), - Instance(1.0, 0.0, features), - Instance(label, 0.1, features), - Instance(label, 0.6, features)) - } - } else { - if (label == 1.0) { - 
Iterator(Instance(label, 2.0, features)) - } else { - Iterator(Instance(label, 1.0, features)) - } - } - } + test("multinomial logistic regression with weighted data") { + val numClasses = 5 + val numPoints = 40 + val outlierData = MLTestingUtils.genClassificationInstancesWithWeightedOutliers(spark, + numClasses, numPoints) + val testData = spark.createDataFrame(Array.tabulate[LabeledPoint](numClasses) { i => + LabeledPoint(i.toDouble, Vectors.dense(i.toDouble)) + }) + val mlr = new LogisticRegression().setFamily("multinomial").setWeightCol("weight") + val model = mlr.fit(outlierData) + val results = model.transform(testData).select("label", "prediction").collect() + + // check that the predictions are the one to one mapping + results.foreach { case Row(label: Double, pred: Double) => + assert(label === pred) + } + val (overSampledData, weightedData) = + MLTestingUtils.genEquivalentOversampledAndWeightedInstances(outlierData, "label", "features", + 42L) + val weightedModel = mlr.fit(weightedData) + val overSampledModel = mlr.setWeightCol("").fit(overSampledData) + assert(weightedModel.coefficientMatrix ~== overSampledModel.coefficientMatrix relTol 0.01) + } - (spark.createDataFrame(sc.parallelize(data1, 4)), - spark.createDataFrame(sc.parallelize(data2, 4))) + test("set family") { + val lr = new LogisticRegression().setMaxIter(1) + // don't set anything for binary classification + val model1 = lr.fit(binaryDataset) + assert(model1.coefficientMatrix.numRows === 1 && model1.coefficientMatrix.numCols === 4) + assert(model1.interceptVector.size === 1) + + // set to multinomial for binary classification + val model2 = lr.setFamily("multinomial").fit(binaryDataset) + assert(model2.coefficientMatrix.numRows === 2 && model2.coefficientMatrix.numCols === 4) + assert(model2.interceptVector.size === 2) + + // set to binary for binary classification + val model3 = lr.setFamily("binomial").fit(binaryDataset) + assert(model3.coefficientMatrix.numRows === 1 && 
model3.coefficientMatrix.numCols === 4) + assert(model3.interceptVector.size === 1) + + // don't set anything for multiclass classification + val mlr = new LogisticRegression().setMaxIter(1) + val model4 = mlr.fit(multinomialDataset) + assert(model4.coefficientMatrix.numRows === 3 && model4.coefficientMatrix.numCols === 4) + assert(model4.interceptVector.size === 3) + + // set to binary for multiclass classification + mlr.setFamily("binomial") + val thrown = intercept[IllegalArgumentException] { + mlr.fit(multinomialDataset) } + assert(thrown.getMessage.contains("Binomial family only supports 1 or 2 outcome classes")) - val trainer1a = (new LogisticRegression).setFitIntercept(true) - .setRegParam(0.0).setStandardization(true) - val trainer1b = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight") - .setRegParam(0.0).setStandardization(true) - val model1a0 = trainer1a.fit(dataset) - val model1a1 = trainer1a.fit(weightedDataset) - val model1b = trainer1b.fit(weightedDataset) - assert(model1a0.coefficients !~= model1a1.coefficients absTol 1E-3) - assert(model1a0.intercept !~= model1a1.intercept absTol 1E-3) - assert(model1a0.coefficients ~== model1b.coefficients absTol 1E-3) - assert(model1a0.intercept ~== model1b.intercept absTol 1E-3) + // set to multinomial for multiclass + mlr.setFamily("multinomial") + val model5 = mlr.fit(multinomialDataset) + assert(model5.coefficientMatrix.numRows === 3 && model5.coefficientMatrix.numCols === 4) + assert(model5.interceptVector.size === 3) } - test("logistic regression with all labels the same") { - val sameLabels = dataset + test("set initial model") { + val lr = new LogisticRegression().setFamily("binomial") + val model1 = lr.fit(smallBinaryDataset) + val lr2 = new LogisticRegression().setInitialModel(model1).setMaxIter(5).setFamily("binomial") + val model2 = lr2.fit(smallBinaryDataset) + val predictions1 = model1.transform(smallBinaryDataset).select("prediction").collect() + val predictions2 = 
model2.transform(smallBinaryDataset).select("prediction").collect() + predictions1.zip(predictions2).foreach { case (Row(p1: Double), Row(p2: Double)) => + assert(p1 === p2) + } + assert(model2.summary.totalIterations === 1) + + val lr3 = new LogisticRegression().setFamily("multinomial") + val model3 = lr3.fit(smallMultinomialDataset) + val lr4 = new LogisticRegression() + .setInitialModel(model3).setMaxIter(5).setFamily("multinomial") + val model4 = lr4.fit(smallMultinomialDataset) + val predictions3 = model3.transform(smallMultinomialDataset).select("prediction").collect() + val predictions4 = model4.transform(smallMultinomialDataset).select("prediction").collect() + predictions3.zip(predictions4).foreach { case (Row(p1: Double), Row(p2: Double)) => + assert(p1 === p2) + } + // TODO: check that it converges in a single iteration when model summary is available + } + + test("binary logistic regression with all labels the same") { + val sameLabels = smallBinaryDataset .withColumn("zeroLabel", lit(0.0)) .withColumn("oneLabel", lit(1.0)) @@ -895,6 +1909,7 @@ class LogisticRegressionSuite val lrIntercept = new LogisticRegression() .setFitIntercept(true) .setMaxIter(3) + .setFamily("binomial") val allZeroInterceptModel = lrIntercept .setLabelCol("zeroLabel") @@ -914,6 +1929,7 @@ class LogisticRegressionSuite val lrNoIntercept = new LogisticRegression() .setFitIntercept(false) .setMaxIter(3) + .setFamily("binomial") val allZeroNoInterceptModel = lrNoIntercept .setLabelCol("zeroLabel") @@ -928,6 +1944,102 @@ class LogisticRegressionSuite assert(allOneNoInterceptModel.summary.totalIterations > 0) } + test("multiclass logistic regression with all labels the same") { + val constantData = spark.createDataFrame(Seq( + LabeledPoint(4.0, Vectors.dense(0.0)), + LabeledPoint(4.0, Vectors.dense(1.0)), + LabeledPoint(4.0, Vectors.dense(2.0))) + ) + val mlr = new LogisticRegression().setFamily("multinomial") + val model = mlr.fit(constantData) + val results = 
model.transform(constantData) + results.select("rawPrediction", "probability", "prediction").collect().foreach { + case Row(raw: Vector, prob: Vector, pred: Double) => + assert(raw === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, Double.PositiveInfinity))) + assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0))) + assert(pred === 4.0) + } + + // force the model to be trained with only one class + val constantZeroData = spark.createDataFrame(Seq( + LabeledPoint(0.0, Vectors.dense(0.0)), + LabeledPoint(0.0, Vectors.dense(1.0)), + LabeledPoint(0.0, Vectors.dense(2.0))) + ) + val modelZeroLabel = mlr.setFitIntercept(false).fit(constantZeroData) + val resultsZero = modelZeroLabel.transform(constantZeroData) + resultsZero.select("rawPrediction", "probability", "prediction").collect().foreach { + case Row(raw: Vector, prob: Vector, pred: Double) => + assert(prob === Vectors.dense(Array(1.0))) + assert(pred === 0.0) + } + + // ensure that the correct value is predicted when numClasses is passed through metadata + val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(6).toMetadata() + val constantDataWithMetadata = constantData + .select(constantData("label").as("label", labelMeta), constantData("features")) + val modelWithMetadata = mlr.setFitIntercept(true).fit(constantDataWithMetadata) + val resultsWithMetadata = modelWithMetadata.transform(constantDataWithMetadata) + resultsWithMetadata.select("rawPrediction", "probability", "prediction").collect().foreach { + case Row(raw: Vector, prob: Vector, pred: Double) => + assert(raw === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, Double.PositiveInfinity, 0.0))) + assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0, 0.0))) + assert(pred === 4.0) + } + // TODO: check num iters is zero when it becomes available in the model + } + + test("compressed storage") { + val moreClassesThanFeatures = spark.createDataFrame(Seq( + LabeledPoint(4.0, Vectors.dense(0.0, 0.0, 0.0)), + LabeledPoint(4.0, 
Vectors.dense(1.0, 1.0, 1.0)), + LabeledPoint(4.0, Vectors.dense(2.0, 2.0, 2.0))) + ) + val mlr = new LogisticRegression().setFamily("multinomial") + val model = mlr.fit(moreClassesThanFeatures) + assert(model.coefficientMatrix.isInstanceOf[SparseMatrix]) + assert(model.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 4) + val moreFeaturesThanClasses = spark.createDataFrame(Seq( + LabeledPoint(1.0, Vectors.dense(0.0, 0.0, 0.0)), + LabeledPoint(1.0, Vectors.dense(1.0, 1.0, 1.0)), + LabeledPoint(1.0, Vectors.dense(2.0, 2.0, 2.0))) + ) + val model2 = mlr.fit(moreFeaturesThanClasses) + assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix]) + assert(model2.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 3) + + val blr = new LogisticRegression().setFamily("binomial") + val blrModel = blr.fit(moreFeaturesThanClasses) + assert(blrModel.coefficientMatrix.isInstanceOf[SparseMatrix]) + assert(blrModel.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 2) + } + + test("numClasses specified in metadata/inferred") { + val lr = new LogisticRegression().setMaxIter(1).setFamily("multinomial") + + // specify more classes than unique label values + val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(4).toMetadata() + val df = smallMultinomialDataset.select(smallMultinomialDataset("label").as("label", labelMeta), + smallMultinomialDataset("features")) + val model1 = lr.fit(df) + assert(model1.numClasses === 4) + assert(model1.interceptVector.size === 4) + + // specify two classes when there are really three + val labelMeta1 = NominalAttribute.defaultAttr.withName("label").withNumValues(2).toMetadata() + val df1 = smallMultinomialDataset + .select(smallMultinomialDataset("label").as("label", labelMeta1), + smallMultinomialDataset("features")) + val thrown = intercept[IllegalArgumentException] { + lr.fit(df1) + } + assert(thrown.getMessage.contains("less than the number of unique labels")) + + // lr should infer 
the number of classes if not specified + val model3 = lr.fit(smallMultinomialDataset) + assert(model3.numClasses === 3) + } + test("read/write") { def checkModelData(model: LogisticRegressionModel, model2: LogisticRegressionModel): Unit = { assert(model.intercept === model2.intercept) @@ -936,7 +2048,7 @@ class LogisticRegressionSuite assert(model.numFeatures === model2.numFeatures) } val lr = new LogisticRegression() - testEstimatorAndModelReadWrite(lr, dataset, LogisticRegressionSuite.allParamSettings, + testEstimatorAndModelReadWrite(lr, smallBinaryDataset, LogisticRegressionSuite.allParamSettings, checkModelData) }
--------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org For additional commands, e-mail: commits-help@spark.apache.org