Repository: mahout Updated Branches: refs/heads/master 9b3c48ef7 -> d848625df (forced update)
[MAHOUT-1924][MAHOUT-1925] Add DW and Unit Tests to CochraneOrcutt closes apache/mahout#287 Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/d848625d Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/d848625d Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/d848625d Branch: refs/heads/master Commit: d848625df7144ef14dd90300adcf253f08aa28e4 Parents: eb278dc Author: rawkintrevo <[email protected]> Authored: Sun Feb 26 22:48:30 2017 -0600 Committer: rawkintrevo <[email protected]> Committed: Sun Feb 26 22:55:35 2017 -0600 ---------------------------------------------------------------------- .../regression/CochraneOrcuttModel.scala | 109 ++++++++++++++----- .../regression/LinearRegressorModel.scala | 15 ++- .../regression/OrdinaryLeastSquaresModel.scala | 7 +- .../algorithms/regression/RegressorModel.scala | 2 + .../math/algorithms/RegressionSuiteBase.scala | 101 ++++++++++++++++- 5 files changed, 199 insertions(+), 35 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/d848625d/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala index 844e72f..3e5a496 100644 --- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala @@ -19,15 +19,20 @@ package org.apache.mahout.math.algorithms.regression -import org.apache.mahout.math.{Vector => MahoutVector} +import org.apache.mahout.math.algorithms.regression.tests._ import org.apache.mahout.math.drm.{CacheHint, DrmLike, safeToNonNegInt} import org.apache.mahout.math.drm.RLikeDrmOps._ +import org.apache.mahout.math.function.Functions import org.apache.mahout.math.scalabindings.RLikeOps._ +import org.apache.mahout.math.{Vector => MahoutVector} + class CochraneOrcuttModel[K](regressor: LinearRegressorModel[K]) extends LinearRegressorModel[K] { // https://en.wikipedia.org/wiki/Cochrane%E2%80%93Orcutt_estimation var betas: Array[MahoutVector] = _ + var dws: Array[Double] = _ + var rhos: Array[Double] = _ def predict(drmPredictors: DrmLike[K]): DrmLike[K] = { regressor.predict(drmPredictors) @@ -37,10 +42,9 @@ class CochraneOrcuttModel[K](regressor: LinearRegressorModel[K]) extends LinearR class CochraneOrcutt[K](hyperparameters: (Symbol, Any)*) extends LinearRegressorFitter[K] { - var regressor: LinearRegressorFitter[K] = hyperparameters.asInstanceOf[Map[Symbol, - LinearRegressorFitter[K]]].getOrElse('regressor, new OrdinaryLeastSquares[K]()) - var iterations: Int = hyperparameters.asInstanceOf[Map[Symbol, Int]].getOrElse('iterations, 3) - var cacheHint: CacheHint.CacheHint = hyperparameters.asInstanceOf[Map[Symbol, CacheHint.CacheHint]].getOrElse('cacheHint, CacheHint.MEMORY_ONLY) + var regressor: LinearRegressorFitter[K] = _ + var iterations: Int = _ + var cacheHint: CacheHint.CacheHint = _ // For larger inputs, CacheHint.MEMORY_AND_DISK2 is reccomended. def setHyperparameters(hyperparameters: Map[Symbol, Any] = Map('foo -> None)): Unit = { @@ -54,47 +58,94 @@ class CochraneOrcutt[K](hyperparameters: (Symbol, Any)*) extends LinearRegresso setHyperparameters(hyperparameters.toMap) + def calculateRho(errorDrm: DrmLike[K]): Double ={ + val error = errorDrm.collect.viewColumn(0) + val n = error.length - 1 + val e2: MahoutVector = error.viewPart(1, n) + val e3: MahoutVector = error.viewPart(0, n) + // regression through the origin lm(e2 ~e3 -1) is sum(e2 * e3) / e3^2 + e3.times(e2).sum / e3.assign(Functions.SQUARE).sum + } + def fit(drmFeatures: DrmLike[K], drmTarget: DrmLike[K], hyperparameters: (Symbol, Any)*): CochraneOrcuttModel[K] = { - var hyperparameters: Option[Map[String,Any]] = None + setHyperparameters(hyperparameters.toMap[Symbol, Any]) + val betas = new Array[MahoutVector](iterations) - var regressionModel: LinearRegressorModel[K] = regressor.fit(drmFeatures, drmTarget) - betas(0) = regressionModel.beta - // todo add dw test option on each iteration + val models = new Array[LinearRegressorModel[K]](iterations) + val dws = new Array[Double](iterations) + val rhos = new Array[Double](iterations) - val drmY = drmTarget val n = safeToNonNegInt(drmTarget.nrow) val Y = drmTarget(1 until n, 0 until 1).checkpoint(cacheHint) val Y_lag = drmTarget(0 until n - 1, 0 until 1).checkpoint(cacheHint) - val X = drmFeatures(1 until n, 0 until 1).checkpoint(cacheHint) - val X_lag = drmFeatures(0 until n - 1, 0 until 1).checkpoint(cacheHint) + val X = drmFeatures(1 until n, 0 until drmFeatures.ncol).checkpoint(cacheHint) + val X_lag = drmFeatures(0 until n - 1, 0 until drmFeatures.ncol).checkpoint(cacheHint) + + // Step 1: Normal Regression + regressor.calcStandardErrors = true + regressor.calcCommonStatistics = true + models(0) = regressor.fit(drmFeatures, drmTarget) + regressor.calcStandardErrors = false + regressor.calcCommonStatistics = false + betas(0) = models(0).beta + var residuals = drmTarget - models(0).predict(drmFeatures) + for (i <- 1 until iterations){ - val error = drmTarget - regressionModel.predict(drmFeatures) - regressionModel = regressor.fit(drmFeatures, drmTarget) - val rho = regressionModel.beta.get(0) + // Step 2: Calculate Rho + val rho_hat = calculateRho(residuals) + rhos(i-1) = rho_hat - val drmYprime = Y - Y_lag * rho - val drmXprime = X - X_lag * rho + // Step 3: Transform Variables + val drmYprime = Y - (Y_lag * rho_hat) + val drmXprime = X - (X_lag * rho_hat) + // Step 4: Get Estimates of Transformed Equation if (i == iterations - 1 ){ - // calculate common stats and SE on last iteration only - // todo make this optional- but if you don't care then why are you even bothering to do this? + // get standard errors on last iteration only regressor.calcStandardErrors = true regressor.calcCommonStatistics = true } - regressionModel = regressor.fit(drmFeatures, drmTarget) - var betaPrime = regressionModel.beta - val b0 = betaPrime(0) / (1 - rho) - betaPrime(0) = b0 - betas(i) = betaPrime + models(i) = regressor.fit(drmXprime, drmYprime) + // Make this optional- only for parity with R reported dw-stat, doesn't really mean anything + dws(i) = AutocorrelationTests.DurbinWatson( models(i), + drmTarget - models(i).predict(drmFeatures)) + .testResults.get('durbinWatsonTestStatistic).get.asInstanceOf[Double] + + models(i).beta(X.ncol) = models(i).beta(X.ncol) / (1 - rho_hat) // intercept adjust + betas(i) = models(i).beta + + // Step 5: Use Betas from (4) to recalculate model from (1) + residuals = drmTarget - models(i).predict(drmFeatures) + + /** Step 6: repeat Step 2 through 5 until a stopping criteria is met. + * some models call for convergence- + * Kunter et. al reccomend 3 iterations, if you don't achieve desired results, use + * an alternative method. + **/ } - val model = new CochraneOrcuttModel[K](regressionModel) - model.betas = betas - model.summary = (0 until iterations).map(i â s"Beta estimates on iteration " + i + ": " - + model.betas.toString + "\n").mkString("") + "\n\n" + "Final Model:\n\n" + regressionModel.summary + var finalModel = new CochraneOrcuttModel[K](models(iterations -1)) + finalModel.betas = betas + finalModel.dws = dws + finalModel.rhos = rhos + finalModel.tScore = models(iterations -1).tScore + finalModel.pval = models(iterations -1).pval + finalModel.beta = models(iterations -1).beta + val se = models(iterations -1).se + se(se.length -1) = se(se.length -1) / (1 - rhos(iterations - 2)) + finalModel.se = se + finalModel.summary = "Original Model:\n" + models(0).summary + + "\n\nTransformed Model:\n" + + generateSummaryString(finalModel) + + "\n\nfinal rho: " + finalModel.rhos(iterations - 2) + + s"\nMSE: ${models(iterations -1 ).mse}\nR2: ${models(iterations -1 ).r2}\n" + + if (models(0).addIntercept == true){ + finalModel.summary = finalModel.summary.replace(s"X${X.ncol}", "(Intercept)") + } - model + finalModel } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/d848625d/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala index 555ee6c..2583795 100644 --- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala @@ -40,7 +40,6 @@ trait LinearRegressorModel[K] extends RegressorModel[K] { trait LinearRegressorFitter[K] extends RegressorFitter[K] { - var addIntercept: Boolean = _ var calcStandardErrors: Boolean = _ var calcCommonStatistics: Boolean = _ @@ -75,14 +74,13 @@ trait LinearRegressorFitter[K] extends RegressorFitter[K] { val pval = dvec(tScore.toArray.map(t => 2 * (1.0 - tDist.cumulativeProbability(t)) )) // ^^ TODO bug in this calculation- fix and add test //degreesFreedom = k - modelOut.summary = "Coef.\t\tEstimate\t\tStd. Error\t\tt-score\t\t\tPr(Beta=0)\n" + - (0 until k).map(i => s"X${i}\t${model.beta(i)}\t${se(i)}\t${tScore(i)}\t${pval(i)}").mkString("\n") + modelOut.se = se modelOut.tScore = tScore modelOut.pval = pval modelOut.degreesFreedom = X.ncol - + modelOut.summary = generateSummaryString(modelOut) if (calcCommonStatistics){ modelOut = calculateCommonStatistics(modelOut, drmTarget, residuals) } @@ -95,6 +93,7 @@ trait LinearRegressorFitter[K] extends RegressorFitter[K] { var modelOut = model modelOut = FittnessTests.CoefficientOfDetermination(model, drmTarget, residuals) modelOut = FittnessTests.MeanSquareError(model, residuals) + modelOut.summary += s"MSE: ${modelOut.mse}\nR2: ${modelOut.r2}\n" modelOut } @@ -118,7 +117,15 @@ trait LinearRegressorFitter[K] extends RegressorFitter[K] { if (addIntercept) { model.summary.replace(s"X${X.ncol - 1}", "(Intercept)") + model.addIntercept = true } model } + + def generateSummaryString[M[K] <: LinearRegressorModel[K]](model: M[K]): String = { + val k = model.beta.length + "Coef.\t\tEstimate\t\tStd. Error\t\tt-score\t\t\tPr(Beta=0)\n" + + (0 until k).map(i => s"X${i}\t${model.beta(i)}\t${model.se(i)}\t${model.tScore(i)}\t${model.pval(i)}").mkString("\n") + + } } http://git-wip-us.apache.org/repos/asf/mahout/blob/d848625d/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala index 682cf1c..58eb45d 100644 --- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala @@ -29,7 +29,11 @@ class OrdinaryLeastSquaresModel[K] // https://en.wikipedia.org/wiki/Ordinary_least_squares def predict(drmPredictors: DrmLike[K]): DrmLike[K] = { - drmPredictors %*% beta + var X = drmPredictors + if (addIntercept) { + X = X cbind 1 + } + X %*% beta } } @@ -41,6 +45,7 @@ class OrdinaryLeastSquares[K] extends LinearRegressorFitter[K] { drmTarget: DrmLike[K], hyperparameters: (Symbol, Any)*): OrdinaryLeastSquaresModel[K] = { + assert(drmTarget.ncol == 1, s"drmTarget must be a single column matrix, found ${drmTarget.ncol} columns") var model = new OrdinaryLeastSquaresModel[K]() setStandardHyperparameters(hyperparameters.toMap) http://git-wip-us.apache.org/repos/asf/mahout/blob/d848625d/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala index bdddb29..8a4b7d2 100644 --- a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala @@ -26,6 +26,7 @@ trait RegressorModel[K] extends SupervisedModel[K] { def predict(drmPredictors: DrmLike[K]): DrmLike[K] + var addIntercept: Boolean = _ // Common Applicable Tests- here only for convenience. var mse: Double = _ var r2: Double = _ @@ -43,6 +44,7 @@ trait RegressorModel[K] extends SupervisedModel[K] { trait RegressorFitter[K] extends SupervisedFitter[K, RegressorModel[K]] { + var addIntercept: Boolean = _ def fitPredict(drmX: DrmLike[K], drmTarget: DrmLike[K], http://git-wip-us.apache.org/repos/asf/mahout/blob/d848625d/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala index 2bb0343..8910ae9 100644 --- a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala +++ b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala @@ -17,7 +17,7 @@ package org.apache.mahout.math.algorithms -import org.apache.mahout.math.algorithms.regression.OrdinaryLeastSquares +import org.apache.mahout.math.algorithms.regression._ import org.apache.mahout.math.drm._ import org.apache.mahout.math.drm.RLikeDrmOps._ import org.apache.mahout.math.scalabindings._ @@ -28,6 +28,8 @@ import org.scalatest.{FunSuite, Matchers} trait RegressionSuiteBase extends DistributedMahoutSuite with Matchers { this: FunSuite => + val epsilon = 1E-6 + test("ordinary least squares") { /* R Prototype: @@ -76,6 +78,103 @@ trait RegressionSuiteBase extends DistributedMahoutSuite with Matchers { // TODO add test for S.E / pvalue } + test("cochrane-orcutt"){ + /* R Prototype: + library(orcutt) + + df = data.frame(t(data.frame( + c(20.96, 127.3), + c(21.40, 130.0), + c(21.96, 132.7), + c(21.52, 129.4), + c(22.39, 135.0), + c(22.76, 137.1), + c(23.48, 141.2), + c(23.66, 142.8), + c(24.10, 145.5), + c(24.01, 145.3), + c(24.54, 148.3), + c(24.30, 146.4), + c(25.00, 150.2), + c(25.64, 153.1), + c(26.36, 157.3), + c(26.98, 160.7), + c(27.52, 164.2), + c(27.78, 165.6), + c(28.24, 168.7), + c(28.78, 171.7)))) + + rownames(df) <- NULL + colnames(df) <- c("y", "x") + my_lm = lm(y ~ x, data=df) + coch = cochrane.orcutt(my_lm) + + /////////////////////////////////////// + The R-implementation is kind of...silly. + + The above works- converges at 318 iterations- the transformed DW is 1.72, yet the rho is + .95882. After 318 iteartions, this will also report a rho of .95882 (which sugguests SEVERE + autocorrelation- nothing close to 1.72. + + At anyrate, the real prototype for this is the example from Applied Linear Statistcal Models + 5th Edition by Kunter, Nachstheim, Neter, and Li. They also provide some interesting notes on p 494: + 1) "Cochrane-Orcutt does not always work properly. A major reason is that when the error terms + are positively autocorrelated, the estimate r in (12.22) tends to underestimate the autocorrelation + parameter rho. When this bias is serious, it can significantly reduce the effectiveness of the + Cochrane-Orcutt approach. + 2. There exists an approximate relation between the Durbin Watson test statistic D in (12.14) + and the estimated autocorrelation paramater r in (12.22): + D ~= 2(1-r)" + + They also note on p492: + "... If the process does not terminate after one or two iterations, a different procedure + should be employed." + This differs from the logic found elsewhere, and the method presented in R where, in the simple + example in the prototype, the procedure runs for 318 iterations. This is why the default + maximum iteratoins are 3, and should be left as such. + + Also, the prototype and 'correct answers' are based on the example presented in Kunter et. al on + p492-4 (including dataset). + + */ + val alsmBlaisdellCo = drmParallelize( dense( + (20.96, 127.3), + (21.40, 130.0), + (21.96, 132.7), + (21.52, 129.4), + (22.39, 135.0), + (22.76, 137.1), + (23.48, 141.2), + (23.66, 142.8), + (24.10, 145.5), + (24.01, 145.3), + (24.54, 148.3), + (24.30, 146.4), + (25.00, 150.2), + (25.64, 153.1), + (26.36, 157.3), + (26.98, 160.7), + (27.52, 164.2), + (27.78, 165.6), + (28.24, 168.7), + (28.78, 171.7) )) + + val drmY = alsmBlaisdellCo(::, 0 until 1) + val drmX = alsmBlaisdellCo(::, 1 until 2) + + var coModel = new CochraneOrcutt[Int]().fit(drmX, drmY , ('iterations -> 2)) + val coResiduals = drmY - coModel.predict(drmX) + + val correctRho = 0.631166 + (coModel.rhos(1) - correctRho) should be < epsilon + + val shortEpsilon = 1E-4 // book rounded off pretty short + val correctBeta = dvec(0.17376, -1.0685) + (coModel.betas(1) - correctBeta).sum.abs < shortEpsilon + + val correctSe = dvec(0.002957, 0.45332) + (coModel.se - correctSe).sum.abs < shortEpsilon + } }
