mahout git commit: [MAHOUT-1924][MAHOUT-1925] Add DW and Unit Tests to CochraneOrcutt closes apache/mahout#287 [Forced Update!]

rawkintrevo Sun, 26 Feb 2017 20:57:06 -0800

Repository: mahout
Updated Branches:
  refs/heads/master 9b3c48ef7 -> d848625df (forced update)



[MAHOUT-1924][MAHOUT-1925] Add DW and Unit Tests to CochraneOrcutt closes 
apache/mahout#287


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/d848625d
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/d848625d
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/d848625d

Branch: refs/heads/master
Commit: d848625df7144ef14dd90300adcf253f08aa28e4
Parents: eb278dc
Author: rawkintrevo <[email protected]>
Authored: Sun Feb 26 22:48:30 2017 -0600
Committer: rawkintrevo <[email protected]>
Committed: Sun Feb 26 22:55:35 2017 -0600

----------------------------------------------------------------------
 .../regression/CochraneOrcuttModel.scala        | 109 ++++++++++++++-----
 .../regression/LinearRegressorModel.scala       |  15 ++-
 .../regression/OrdinaryLeastSquaresModel.scala  |   7 +-
 .../algorithms/regression/RegressorModel.scala  |   2 +
 .../math/algorithms/RegressionSuiteBase.scala   | 101 ++++++++++++++++-
 5 files changed, 199 insertions(+), 35 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/d848625d/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala
----------------------------------------------------------------------
diff --git 
a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala
 
b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala
index 844e72f..3e5a496 100644
--- 
a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala
+++ 
b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala
@@ -19,15 +19,20 @@
 
 package org.apache.mahout.math.algorithms.regression
 
-import org.apache.mahout.math.{Vector => MahoutVector}
+import org.apache.mahout.math.algorithms.regression.tests._
 import org.apache.mahout.math.drm.{CacheHint, DrmLike, safeToNonNegInt}
 import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.function.Functions
 import org.apache.mahout.math.scalabindings.RLikeOps._
+import org.apache.mahout.math.{Vector => MahoutVector}
+
 
 class CochraneOrcuttModel[K](regressor: LinearRegressorModel[K]) extends 
LinearRegressorModel[K] {
   // https://en.wikipedia.org/wiki/Cochrane%E2%80%93Orcutt_estimation
 
   var betas: Array[MahoutVector] = _
+  var dws: Array[Double] = _
+  var rhos: Array[Double] = _
 
   def predict(drmPredictors: DrmLike[K]): DrmLike[K] = {
     regressor.predict(drmPredictors)
@@ -37,10 +42,9 @@ class CochraneOrcuttModel[K](regressor: 
LinearRegressorModel[K]) extends LinearR
 
 class CochraneOrcutt[K](hyperparameters: (Symbol, Any)*)  extends 
LinearRegressorFitter[K] {
 
-  var regressor: LinearRegressorFitter[K] = 
hyperparameters.asInstanceOf[Map[Symbol,
-    LinearRegressorFitter[K]]].getOrElse('regressor, new 
OrdinaryLeastSquares[K]())
-  var iterations: Int = hyperparameters.asInstanceOf[Map[Symbol, 
Int]].getOrElse('iterations, 3)
-  var cacheHint: CacheHint.CacheHint = 
hyperparameters.asInstanceOf[Map[Symbol, 
CacheHint.CacheHint]].getOrElse('cacheHint, CacheHint.MEMORY_ONLY)
+  var regressor: LinearRegressorFitter[K] = _
+  var iterations: Int = _
+  var cacheHint: CacheHint.CacheHint = _
   // For larger inputs, CacheHint.MEMORY_AND_DISK2 is reccomended.
 
   def setHyperparameters(hyperparameters: Map[Symbol, Any] = Map('foo -> 
None)): Unit = {
@@ -54,47 +58,94 @@ class CochraneOrcutt[K](hyperparameters: (Symbol, Any)*)  
extends LinearRegresso
 
   setHyperparameters(hyperparameters.toMap)
 
+  def calculateRho(errorDrm: DrmLike[K]): Double ={
+    val error = errorDrm.collect.viewColumn(0)
+    val n = error.length - 1
+    val e2: MahoutVector = error.viewPart(1, n)
+    val e3: MahoutVector = error.viewPart(0, n)
+    // regression through the origin lm(e2 ~e3 -1) is sum(e2 * e3) / e3^2
+    e3.times(e2).sum / e3.assign(Functions.SQUARE).sum
+  }
+
   def fit(drmFeatures: DrmLike[K], drmTarget: DrmLike[K], hyperparameters: 
(Symbol, Any)*): CochraneOrcuttModel[K] = {
 
-    var hyperparameters: Option[Map[String,Any]] = None
+    setHyperparameters(hyperparameters.toMap[Symbol, Any])
+
     val betas = new Array[MahoutVector](iterations)
-    var regressionModel: LinearRegressorModel[K] = regressor.fit(drmFeatures, 
drmTarget)
-    betas(0) = regressionModel.beta
-    // todo add dw test option on each iteration
+    val models = new Array[LinearRegressorModel[K]](iterations)
+    val dws = new Array[Double](iterations)
+    val rhos = new Array[Double](iterations)
 
-    val drmY = drmTarget
     val n = safeToNonNegInt(drmTarget.nrow)
     val Y = drmTarget(1 until n, 0 until 1).checkpoint(cacheHint)
     val Y_lag = drmTarget(0 until n - 1, 0 until 1).checkpoint(cacheHint)
-    val X = drmFeatures(1 until n, 0 until 1).checkpoint(cacheHint)
-    val X_lag = drmFeatures(0 until n - 1, 0 until 1).checkpoint(cacheHint)
+    val X = drmFeatures(1 until n, 0 until 
drmFeatures.ncol).checkpoint(cacheHint)
+    val X_lag = drmFeatures(0 until n - 1, 0 until 
drmFeatures.ncol).checkpoint(cacheHint)
+
+    // Step 1: Normal Regression
+    regressor.calcStandardErrors = true
+    regressor.calcCommonStatistics = true
+    models(0) = regressor.fit(drmFeatures, drmTarget)
+    regressor.calcStandardErrors = false
+    regressor.calcCommonStatistics = false
+    betas(0) = models(0).beta
+    var residuals = drmTarget - models(0).predict(drmFeatures)
+
     for (i <- 1 until iterations){
-      val error = drmTarget - regressionModel.predict(drmFeatures)
-      regressionModel = regressor.fit(drmFeatures, drmTarget)
-      val rho = regressionModel.beta.get(0)
+      // Step 2: Calculate Rho
+      val rho_hat = calculateRho(residuals)
+      rhos(i-1) = rho_hat
 
-      val drmYprime = Y - Y_lag * rho
-      val drmXprime = X - X_lag * rho
+      // Step 3: Transform Variables
+      val drmYprime = Y - (Y_lag * rho_hat)
+      val drmXprime = X - (X_lag * rho_hat)
 
+      // Step 4: Get Estimates of Transformed Equation
       if (i == iterations - 1 ){
-        // calculate common stats and SE on last iteration only
-        // todo make this optional- but if you don't care then why are you 
even bothering to do this?
+        // get standard errors on last iteration only
         regressor.calcStandardErrors = true
         regressor.calcCommonStatistics = true
       }
-      regressionModel = regressor.fit(drmFeatures, drmTarget)
-      var betaPrime = regressionModel.beta
-      val b0 = betaPrime(0) / (1 - rho)
-      betaPrime(0) = b0
-      betas(i) = betaPrime
+      models(i) = regressor.fit(drmXprime, drmYprime)
+      // Make this optional- only for parity with R reported dw-stat, doesn't 
really mean anything
+      dws(i) = AutocorrelationTests.DurbinWatson( models(i),
+                                                  drmTarget - 
models(i).predict(drmFeatures))
+        .testResults.get('durbinWatsonTestStatistic).get.asInstanceOf[Double]
+
+      models(i).beta(X.ncol) = models(i).beta(X.ncol) / (1 - rho_hat) // 
intercept adjust
+      betas(i) = models(i).beta
+
+      // Step 5: Use Betas from (4) to recalculate model from (1)
+      residuals = drmTarget - models(i).predict(drmFeatures)
+
+      /** Step 6: repeat  Step 2 through 5 until a stopping criteria is met.
+        *  some models call for convergence-
+        *   Kunter et. al reccomend 3 iterations, if you don't achieve desired 
results, use
+        *   an alternative method.
+        **/
     }
 
-    val model = new CochraneOrcuttModel[K](regressionModel)
-    model.betas = betas
-    model.summary = (0 until iterations).map(i â s"Beta estimates on 
iteration " + i + ": "
-      + model.betas.toString + "\n").mkString("") + "\n\n" + "Final 
Model:\n\n" + regressionModel.summary
+    var finalModel = new CochraneOrcuttModel[K](models(iterations -1))
+    finalModel.betas = betas
+    finalModel.dws = dws
+    finalModel.rhos = rhos
+    finalModel.tScore = models(iterations -1).tScore
+    finalModel.pval =  models(iterations -1).pval
+    finalModel.beta = models(iterations -1).beta
+    val se = models(iterations -1).se
+    se(se.length -1) = se(se.length -1) / (1 - rhos(iterations - 2))
+    finalModel.se = se
+    finalModel.summary = "Original Model:\n" + models(0).summary +
+    "\n\nTransformed Model:\n" +
+      generateSummaryString(finalModel) +
+    "\n\nfinal rho: " + finalModel.rhos(iterations - 2) +
+    s"\nMSE: ${models(iterations -1 ).mse}\nR2: ${models(iterations -1 ).r2}\n"
+
+    if (models(0).addIntercept == true){
+      finalModel.summary = finalModel.summary.replace(s"X${X.ncol}", 
"(Intercept)")
+    }
 
-    model
+    finalModel
   }
 
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/d848625d/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala
----------------------------------------------------------------------
diff --git 
a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala
 
b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala
index 555ee6c..2583795 100644
--- 
a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala
+++ 
b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala
@@ -40,7 +40,6 @@ trait LinearRegressorModel[K] extends RegressorModel[K] {
 
 trait LinearRegressorFitter[K] extends RegressorFitter[K] {
 
-  var addIntercept: Boolean = _
   var calcStandardErrors: Boolean = _
   var calcCommonStatistics: Boolean = _
 
@@ -75,14 +74,13 @@ trait LinearRegressorFitter[K] extends RegressorFitter[K] {
     val pval = dvec(tScore.toArray.map(t => 2 * (1.0 - 
tDist.cumulativeProbability(t)) ))
     // ^^ TODO bug in this calculation- fix and add test
     //degreesFreedom = k
-    modelOut.summary = "Coef.\t\tEstimate\t\tStd. 
Error\t\tt-score\t\t\tPr(Beta=0)\n" +
-      (0 until k).map(i => 
s"X${i}\t${model.beta(i)}\t${se(i)}\t${tScore(i)}\t${pval(i)}").mkString("\n")
+
 
     modelOut.se = se
     modelOut.tScore = tScore
     modelOut.pval = pval
     modelOut.degreesFreedom = X.ncol
-
+    modelOut.summary = generateSummaryString(modelOut)
     if (calcCommonStatistics){
       modelOut = calculateCommonStatistics(modelOut, drmTarget, residuals)
     }
@@ -95,6 +93,7 @@ trait LinearRegressorFitter[K] extends RegressorFitter[K] {
     var modelOut = model
     modelOut = FittnessTests.CoefficientOfDetermination(model, drmTarget, 
residuals)
     modelOut = FittnessTests.MeanSquareError(model, residuals)
+    modelOut.summary += s"MSE: ${modelOut.mse}\nR2: ${modelOut.r2}\n"
     modelOut
   }
 
@@ -118,7 +117,15 @@ trait LinearRegressorFitter[K] extends RegressorFitter[K] {
 
     if (addIntercept) {
       model.summary.replace(s"X${X.ncol - 1}", "(Intercept)")
+      model.addIntercept = true
     }
     model
   }
+
+  def generateSummaryString[M[K] <: LinearRegressorModel[K]](model: M[K]): 
String = {
+    val k = model.beta.length
+    "Coef.\t\tEstimate\t\tStd. Error\t\tt-score\t\t\tPr(Beta=0)\n" +
+      (0 until k).map(i => 
s"X${i}\t${model.beta(i)}\t${model.se(i)}\t${model.tScore(i)}\t${model.pval(i)}").mkString("\n")
+
+  }
 }

http://git-wip-us.apache.org/repos/asf/mahout/blob/d848625d/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala
----------------------------------------------------------------------
diff --git 
a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala
 
b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala
index 682cf1c..58eb45d 100644
--- 
a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala
+++ 
b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala
@@ -29,7 +29,11 @@ class OrdinaryLeastSquaresModel[K]
   // https://en.wikipedia.org/wiki/Ordinary_least_squares
 
   def predict(drmPredictors: DrmLike[K]): DrmLike[K] = {
-    drmPredictors %*% beta
+    var X = drmPredictors
+    if (addIntercept) {
+      X = X cbind 1
+    }
+    X %*% beta
   }
 
 }
@@ -41,6 +45,7 @@ class OrdinaryLeastSquares[K] extends 
LinearRegressorFitter[K] {
           drmTarget: DrmLike[K],
           hyperparameters: (Symbol, Any)*): OrdinaryLeastSquaresModel[K] = {
 
+    assert(drmTarget.ncol == 1, s"drmTarget must be a single column matrix, 
found ${drmTarget.ncol} columns")
     var model = new OrdinaryLeastSquaresModel[K]()
     setStandardHyperparameters(hyperparameters.toMap)
 

http://git-wip-us.apache.org/repos/asf/mahout/blob/d848625d/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala
----------------------------------------------------------------------
diff --git 
a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala
 
b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala
index bdddb29..8a4b7d2 100644
--- 
a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala
+++ 
b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala
@@ -26,6 +26,7 @@ trait RegressorModel[K] extends SupervisedModel[K] {
 
   def predict(drmPredictors: DrmLike[K]): DrmLike[K]
 
+  var addIntercept: Boolean = _
   // Common Applicable Tests- here only for convenience.
   var mse: Double = _
   var r2: Double = _
@@ -43,6 +44,7 @@ trait RegressorModel[K] extends SupervisedModel[K] {
 
 trait RegressorFitter[K] extends SupervisedFitter[K, RegressorModel[K]] {
 
+  var addIntercept: Boolean = _
 
   def fitPredict(drmX: DrmLike[K],
                  drmTarget: DrmLike[K],

http://git-wip-us.apache.org/repos/asf/mahout/blob/d848625d/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala
----------------------------------------------------------------------
diff --git 
a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala
 
b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala
index 2bb0343..8910ae9 100644
--- 
a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala
+++ 
b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala
@@ -17,7 +17,7 @@
 
 package org.apache.mahout.math.algorithms
 
-import org.apache.mahout.math.algorithms.regression.OrdinaryLeastSquares
+import org.apache.mahout.math.algorithms.regression._
 import org.apache.mahout.math.drm._
 import org.apache.mahout.math.drm.RLikeDrmOps._
 import org.apache.mahout.math.scalabindings._
@@ -28,6 +28,8 @@ import org.scalatest.{FunSuite, Matchers}
 trait RegressionSuiteBase extends DistributedMahoutSuite with Matchers {
   this: FunSuite =>
 
+  val epsilon = 1E-6
+
   test("ordinary least squares") {
     /*
     R Prototype:
@@ -76,6 +78,103 @@ trait RegressionSuiteBase extends DistributedMahoutSuite 
with Matchers {
     // TODO add test for S.E / pvalue
   }
 
+  test("cochrane-orcutt"){
+    /* R Prototype:
+    library(orcutt)
+
+    df = data.frame(t(data.frame(
+        c(20.96,  127.3),
+        c(21.40,  130.0),
+        c(21.96,  132.7),
+        c(21.52,  129.4),
+        c(22.39,  135.0),
+        c(22.76,  137.1),
+        c(23.48,  141.2),
+        c(23.66,  142.8),
+        c(24.10,  145.5),
+        c(24.01,  145.3),
+        c(24.54,  148.3),
+        c(24.30,  146.4),
+        c(25.00,  150.2),
+        c(25.64,  153.1),
+        c(26.36,  157.3),
+        c(26.98,  160.7),
+        c(27.52,  164.2),
+        c(27.78,  165.6),
+        c(28.24,  168.7),
+        c(28.78,  171.7))))
+
+    rownames(df) <- NULL
+    colnames(df) <- c("y", "x")
+    my_lm = lm(y ~ x, data=df)
+    coch = cochrane.orcutt(my_lm)
+
+    ///////////////////////////////////////
+    The R-implementation is kind of...silly.
+
+    The above works- converges at 318 iterations- the transformed DW is   
1.72, yet the rho is
+     .95882.   After 318 iteartions, this will also report a rho of .95882 
(which sugguests SEVERE
+     autocorrelation- nothing close to 1.72.
+
+     At anyrate, the real prototype for this is the example from Applied 
Linear Statistcal Models
+     5th Edition by Kunter, Nachstheim, Neter, and Li.  They also provide some 
interesting notes on p 494:
+     1) "Cochrane-Orcutt does not always work properly.  A major reason is 
that when the error terms
+     are positively autocorrelated, the estimate r in (12.22) tends to 
underestimate the autocorrelation
+     parameter rho.  When this bias is serious, it can significantly reduce 
the effectiveness of the
+     Cochrane-Orcutt approach.
+     2. There exists an approximate relation between the Durbin Watson test 
statistic D in (12.14)
+     and the estimated autocorrelation paramater r in (12.22):
+     D ~= 2(1-r)"
+
+     They also note on p492:
+     "... If the process does not terminate after one or two iterations, a 
different procedure
+     should be employed."
+     This differs from the logic found elsewhere, and the method presented in 
R where, in the simple
+      example in the prototype, the procedure runs for 318 iterations. This is 
why the default
+     maximum iteratoins are 3, and should be left as such.
+
+     Also, the prototype and 'correct answers' are based on the example 
presented in Kunter et. al on
+     p492-4 (including dataset).
+
+     */
 
+    val alsmBlaisdellCo = drmParallelize( dense(
+      (20.96,  127.3),
+      (21.40,  130.0),
+      (21.96,  132.7),
+      (21.52,  129.4),
+      (22.39,  135.0),
+      (22.76,  137.1),
+      (23.48,  141.2),
+      (23.66,  142.8),
+      (24.10,  145.5),
+      (24.01,  145.3),
+      (24.54,  148.3),
+      (24.30,  146.4),
+      (25.00,  150.2),
+      (25.64,  153.1),
+      (26.36,  157.3),
+      (26.98,  160.7),
+      (27.52,  164.2),
+      (27.78,  165.6),
+      (28.24,  168.7),
+      (28.78,  171.7) ))
+
+    val drmY = alsmBlaisdellCo(::, 0 until 1)
+    val drmX = alsmBlaisdellCo(::, 1 until 2)
+
+    var coModel = new CochraneOrcutt[Int]().fit(drmX, drmY , ('iterations -> 
2))
+    val coResiduals = drmY - coModel.predict(drmX)
+
+    val correctRho = 0.631166
+    (coModel.rhos(1) - correctRho) should be < epsilon
+
+    val shortEpsilon = 1E-4 // book rounded off pretty short
+    val correctBeta = dvec(0.17376, -1.0685)
+    (coModel.betas(1) - correctBeta).sum.abs < shortEpsilon
+
+    val correctSe = dvec(0.002957, 0.45332)
+    (coModel.se - correctSe).sum.abs < shortEpsilon
+  }
 
 }

mahout git commit: [MAHOUT-1924][MAHOUT-1925] Add DW and Unit Tests to CochraneOrcutt closes apache/mahout#287 [Forced Update!]

Reply via email to