Repository: mahout Updated Branches: refs/heads/master 7c0babd79 -> 9b4eabb9c
MAHOUT-1962 Add Ftest closes apache/mahout#300 Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/cf5a33d5 Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/cf5a33d5 Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/cf5a33d5 Branch: refs/heads/master Commit: cf5a33d565f6602f6624bc4d34c62c048379140f Parents: 7c0babd Author: dustinvanstee <[email protected]> Authored: Sat May 20 22:19:32 2017 -0500 Committer: rawkintrevo <[email protected]> Committed: Sat May 20 22:19:32 2017 -0500 ---------------------------------------------------------------------- .../regression/LinearRegressorModel.scala | 71 ++++++++++++++--- .../regression/OrdinaryLeastSquaresModel.scala | 1 + .../algorithms/regression/RegressorModel.scala | 6 ++ .../regression/tests/FittnessTests.scala | 83 +++++++++++++++++++- .../algorithms/RegressionTestsSuiteBase.scala | 23 +++++- 5 files changed, 166 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/cf5a33d5/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala index 84f50ed..7b87a1a 100644 --- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala @@ -26,6 +26,8 @@ import org.apache.mahout.math.drm.RLikeDrmOps._ import org.apache.mahout.math.scalabindings.dvec import org.apache.mahout.math.{Matrix, Vector => MahoutVector} import org.apache.mahout.math.scalabindings.RLikeOps._ +import org.apache.commons.math3.distribution._ + import scala.language.higherKinds trait LinearRegressorModel[K] extends RegressorModel[K] { @@ -34,7 +36,8 @@ trait LinearRegressorModel[K] extends RegressorModel[K] { var se: MahoutVector = _ var tScore: MahoutVector = _ var pval: MahoutVector = _ - var degreesFreedom: Int = _ + + } @@ -54,46 +57,59 @@ trait LinearRegressorFitter[K] extends RegressorFitter[K] { addIntercept = hyperparameters.asInstanceOf[Map[Symbol, Boolean]].getOrElse('addIntercept, true) } + def calculateStandardError[M[K] <: LinearRegressorModel[K]](X: DrmLike[K], drmTarget: DrmLike[K], drmXtXinv: Matrix, model: M[K]): M[K] = { import org.apache.mahout.math.function.Functions.SQRT import org.apache.mahout.math.scalabindings.MahoutCollections._ - var modelOut = model + val yhat = X %*% model.beta val residuals = drmTarget - yhat - val ete = (residuals.t %*% residuals).collect // 1x1 + + // Setting modelOut.rss + // Changed name from ete, to rssModel. This is residual sum of squares for model of yhat vs y + var modelOut = FittnessTests.calculateResidualSumOfSquares(model,residuals) + val n = drmTarget.nrow val k = safeToNonNegInt(X.ncol) val invDegFreedomKindOf = 1.0 / (n - k) - val varCovarMatrix = invDegFreedomKindOf * ete(0,0) * drmXtXinv + val varCovarMatrix = invDegFreedomKindOf * modelOut.rss * drmXtXinv val se = varCovarMatrix.viewDiagonal.assign(SQRT) val tScore = model.beta / se - val tDist = new org.apache.commons.math3.distribution.TDistribution(n-k) + val tDist = new TDistribution(n-k) + val pval = dvec(tScore.toArray.map(t => 2 * (1.0 - tDist.cumulativeProbability(Math.abs(t))) )) + // ^^ TODO bug in this calculation- fix and add test //degreesFreedom = k - - modelOut.se = se modelOut.tScore = tScore modelOut.pval = pval - modelOut.degreesFreedom = X.ncol - modelOut.summary = generateSummaryString(modelOut) + modelOut.degreesOfFreedom = safeToNonNegInt(X.ncol) + modelOut.trainingExamples = safeToNonNegInt(n) + if (calcCommonStatistics){ modelOut = calculateCommonStatistics(modelOut, drmTarget, residuals) } + + // Let Statistics Get Calculated prior to assigning the summary + modelOut.summary = generateSummaryString(modelOut) + modelOut } + def calculateCommonStatistics[M[K] <: LinearRegressorModel[K]](model: M[K], drmTarget: DrmLike[K], residuals: DrmLike[K]): M[K] ={ var modelOut = model modelOut = FittnessTests.CoefficientOfDetermination(model, drmTarget, residuals) modelOut = FittnessTests.MeanSquareError(model, residuals) - modelOut.summary += s"MSE: ${modelOut.mse}\nR2: ${modelOut.r2}\n" + modelOut = FittnessTests.FTest(model, drmTarget) + + modelOut } @@ -109,6 +125,8 @@ trait LinearRegressorFitter[K] extends RegressorFitter[K] { (0 until X.ncol).map(i => s"X${i}\t${modelOut.beta(i)}").mkString("\n") if (calcCommonStatistics) { // we do this in calcStandard errors to avoid calculating residuals twice val residuals = drmTarget - (X %*% modelOut.beta) + // If rss is already set, then this will drop through to calculateCommonStatistics + modelOut = FittnessTests.calculateResidualSumOfSquares(modelOut,residuals) modelOut = calculateCommonStatistics(modelOut, drmTarget, residuals) } @@ -123,9 +141,38 @@ trait LinearRegressorFitter[K] extends RegressorFitter[K] { } def generateSummaryString[M[K] <: LinearRegressorModel[K]](model: M[K]): String = { + + /* Model after R implementation ... + Call: + lm(formula = target ~ a + b + c + d, data = df1) + + Residuals: + 1 2 3 4 5 6 7 8 9 + -4.2799 0.5059 -2.2783 4.3765 -1.3455 0.7202 -1.8063 1.2889 2.8184 + + Coefficients: + Estimate Std. Error t value Pr(>|t|) + (Intercept) 163.179 51.915 3.143 0.0347 * + a -1.336 2.688 -0.497 0.6452 + b -13.158 5.394 -2.439 0.0713 . + c -4.153 1.785 -2.327 0.0806 . + d -5.680 1.887 -3.010 0.0395 * + --- + Signif. codes: 0 â***â 0.001 â**â 0.01 â*â 0.05 â.â 0.1 â â 1 + */ + val k = model.beta.length - "Coef.\t\tEstimate\t\tStd. Error\t\tt-score\t\t\tPr(Beta=0)\n" + - (0 until k).map(i => s"X${i}\t${model.beta(i)}\t${model.se(i)}\t${model.tScore(i)}\t${model.pval(i)}").mkString("\n") + // Using Formatted Print here to pretty print the columns + var summaryString = "\nCoef.\t\tEstimate\t\tStd. Error\t\tt-score\t\t\tPr(Beta=0)\n" + + (0 until k).map(i => "X%-3d\t\t%+5.5f\t\t%+5.5f\t\t%+5.5f\t\t%+5.5f".format(i,model.beta(i),model.se(i),model.tScore(i),model.pval(i))).mkString("\n") + if(calcCommonStatistics) { + summaryString += "\nF-statistic: " + model.fScore + " on " + (model.degreesOfFreedom - 1) + " and " + + (model.trainingExamples - model.degreesOfFreedom) + " DF, p-value: " + 0.009545 + "\n" + summaryString += s"\nMean Squared Error: ${model.mse}" + summaryString += s"\nR^2: ${model.r2}" + + } + summaryString } } http://git-wip-us.apache.org/repos/asf/mahout/blob/cf5a33d5/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala index 58eb45d..fd9924e 100644 --- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala @@ -55,6 +55,7 @@ class OrdinaryLeastSquares[K] extends LinearRegressorFitter[K] { } var X = drmFeatures + if (addIntercept) { X = X cbind 1 } http://git-wip-us.apache.org/repos/asf/mahout/blob/cf5a33d5/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala index 8a4b7d2..aa3dad4 100644 --- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala @@ -30,6 +30,12 @@ trait RegressorModel[K] extends SupervisedModel[K] { // Common Applicable Tests- here only for convenience. var mse: Double = _ var r2: Double = _ + var fpval: Double = _ + // default rss to a negative number to ensure rss gets set. + var rss:Double = -9999.0 + var fScore: Double = _ + var degreesOfFreedom: Int = _ + var trainingExamples :Int = _ /** * Syntatictic sugar for fetching test results. Will Return test result if it exists, otherwise None http://git-wip-us.apache.org/repos/asf/mahout/blob/cf5a33d5/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala index d1dd3bb..c2d634b 100644 --- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala @@ -19,14 +19,19 @@ package org.apache.mahout.math.algorithms.regression.tests + + + + +import org.apache.commons.math3.distribution.FDistribution import org.apache.mahout.math.algorithms.regression.RegressorModel import org.apache.mahout.math.algorithms.preprocessing.MeanCenter import org.apache.mahout.math.drm.DrmLike import org.apache.mahout.math.function.Functions.SQUARE import org.apache.mahout.math.scalabindings.RLikeOps._ +import org.apache.mahout.math.drm.RLikeDrmOps._ import scala.language.higherKinds -import scala.reflect.ClassTag object FittnessTests { @@ -41,16 +46,88 @@ object FittnessTests { val r2 = 1 - (sumSquareResiduals / sumSquareTotal) model.r2 = r2 model.testResults += ('r2 -> r2) // need setResult and setSummary method incase you change in future, also to initialize map if non exists or update value if it does - model.summary += s"\nR^2: ${r2}" + //model.summary += s"\nR^2: ${r2}" model } // https://en.wikipedia.org/wiki/Mean_squared_error def MeanSquareError[R[K] <: RegressorModel[K], K](model: R[K], residuals: DrmLike[K]): R[K] = { + // TODO : I think mse denom should be (row - col) ?? <-- https://en.wikipedia.org/wiki/Mean_squared_error see regression section val mse = residuals.assign(SQUARE).sum / residuals.nrow model.mse = mse model.testResults += ('mse -> mse) - model.summary += s"\nMean Squared Error: ${mse}" + //model.summary += s"\nMean Squared Error: ${mse}" + model + } + + // Since rss is needed for multiple test statistics, use this function to cache this value + def calculateResidualSumOfSquares[R[K] <: RegressorModel[K], K](model: R[K],residuals: DrmLike[K]) : R[K] = { + // This is a check so that model.rss isnt unnecessarily computed + // by default setting this value to negative, so that the first time its garaunteed to evaluate. + if (model.rss < 0) { + val ete = (residuals.t %*% residuals).collect // 1x1 + model.rss = ete(0, 0) + } + model + } + + + // https://en.wikipedia.org/wiki/F-test + /* + # R Prototype + # Cereal Dataframe + df1 <- data.frame( + "X0" = c(1,1,1,1,1,1,1,1,1), + "a" = c(2,1,1,2,1,2,6,3,3), + "b" = c( 2,2,1,1,2,1,2,2,3), + "c" = c( 10.5,12,12, 11,12, 16,17, 13,13), + "d" = c( 10,12,13,13,11,8, 1, 7, 4), + "target" = c( 29.509541,18.042851,22.736446,32.207582,21.871292,36.187559,50.764999,40.400208,45.811716)) + + # Create linear regression models adding features one by one + lrfit0 <- lm(data=df1, formula = target ~ 1 ) + lrfit1 <- lm(data=df1, formula = target ~ a ) + lrfit2 <- lm(data=df1, formula = target ~ a + b ) + lrfit3 <- lm(data=df1, formula = target ~ a + b + c ) + lrfit4 <- lm(data=df1, formula = target ~ a + b + c + d) + + ###################################### + # Fscore Calculation + ###################################### + + # So in the anova report using lm ... + # These are the residual sum of squares for each model + rssint <- sum(lrfit0$residuals^2) + rssa <- sum(lrfit1$residuals^2) + rssb <- sum(lrfit2$residuals^2) + rssc <- sum(lrfit3$residuals^2) + rssd <- sum(lrfit4$residuals^2) + + #Ftest in overall model + (rssint - rssd)/4 / (rssd/4) # g = 4, n - g - 1 = 4 + # Compare with R + summary(lrfit4) + + */ + def FTest[R[K] <: RegressorModel[K], K](model: R[K] , drmTarget: DrmLike[K]): R[K] = { + + val targetMean: Double = drmTarget.colMeans().get(0) + + // rssint is the Residual Sum of Squares for model using only based on the intercept + val rssint: Double = ((drmTarget - targetMean ).t %*% (drmTarget - targetMean)).zSum() + // K-1 is model.degreesOfFreedom-1 + // N-K is model.trainingExamples - model.degreesOfFreedom + + val fScore = ((rssint - model.rss) / (model.degreesOfFreedom-1) / ( model.rss / (model.trainingExamples - model.degreesOfFreedom))) + val fDist = new FDistribution(model.degreesOfFreedom-1,model.trainingExamples-model.degreesOfFreedom) + val fpval = 1.0 - fDist.cumulativeProbability(fScore) + model.fpval = fpval + + model.fScore = fScore + model.testResults += ('fScore -> fScore) + //model.summary += s"\nFscore : ${fScore}" model } + + } http://git-wip-us.apache.org/repos/asf/mahout/blob/cf5a33d5/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala index 55967e1..57dffef 100644 --- a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala +++ b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala @@ -21,7 +21,7 @@ package org.apache.mahout.math.algorithms import org.apache.mahout.math.algorithms.regression.OrdinaryLeastSquares import org.apache.mahout.math.algorithms.regression.tests._ -import org.apache.mahout.math.drm.drmParallelize +import org.apache.mahout.math.drm.{CheckpointedDrm, drmParallelize} import org.apache.mahout.math.drm.RLikeDrmOps._ import org.apache.mahout.math.scalabindings.{`::`, dense} import org.apache.mahout.test.DistributedMahoutSuite @@ -78,11 +78,26 @@ trait RegressionTestsSuiteBase extends DistributedMahoutSuite with Matchers { val r2: Double = model.r2 val mse: Double = model.mse - println("R2: " + r2) - println("MSE: " + mse) (rR2 - r2) should be < epsilon (rMSE - mse) should be < epsilon + Math.abs(model.beta.get(4) - 163.17933 ) should be < epsilon + Math.abs(model.beta.get(0) - (-1.33627) ) should be < epsilon + Math.abs(model.beta.get(1) - (-13.15770)) should be < epsilon + Math.abs(model.beta.get(2) - (-4.15265) ) should be < epsilon + Math.abs(model.beta.get(3) - (-5.679908)) should be < epsilon + + Math.abs(model.tScore.get(0) - (-0.49715717)) should be < epsilon + Math.abs(model.tScore.get(1) - (-2.43932888)) should be < epsilon + Math.abs(model.tScore.get(2) - (-2.32654000)) should be < epsilon + Math.abs(model.tScore.get(3) - (-3.01022444)) should be < epsilon + Math.abs(model.tScore.get(4) - 3.143183937 ) should be < epsilon + + model.degreesOfFreedom should equal(5) + model.trainingExamples should equal(9) + + Math.abs((model.fScore - 16.38542361)) should be < 0.0000001 + } test("durbinWatsonTest test") { @@ -106,4 +121,6 @@ trait RegressionTestsSuiteBase extends DistributedMahoutSuite with Matchers { (myAnswer - correctAnswer) should be < epsilon } + } +
