// http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/classifier/naivebayes/NBTestBase.scala
// ----------------------------------------------------------------------
// diff --git a/math-scala/src/test/scala/org/apache/mahout/classifier/naivebayes/NBTestBase.scala b/math-scala/src/test/scala/org/apache/mahout/classifier/naivebayes/NBTestBase.scala
// deleted file mode 100644
// index c8f8a90..0000000
// --- a/math-scala/src/test/scala/org/apache/mahout/classifier/naivebayes/NBTestBase.scala
// +++ /dev/null
// @@ -1,291 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.classifier.naivebayes

import org.apache.mahout.math._
import org.apache.mahout.math.scalabindings._
import org.apache.mahout.test.DistributedMahoutSuite
import org.apache.mahout.test.MahoutSuite
import org.scalatest.{FunSuite, Matchers}
import collection._
import JavaConversions._
import collection.JavaConversions

/**
 * Backend-independent Naive Bayes tests, meant to be mixed into a concrete
 * `FunSuite` (see the self-type).
 *
 * All floating-point checks compare the ABSOLUTE difference against `epsilon`;
 * a signed difference would let arbitrarily small (wrong) values pass.
 */
trait NBTestBase extends DistributedMahoutSuite with Matchers { this: FunSuite =>

  /** Absolute tolerance for floating-point comparisons in this suite. */
  val epsilon = 1E-6

  test("Simple Standard NB Model") {

    // test from simulated sparse TF-IDF data
    val inCoreTFIDF = sparse(
      (0, 0.7) :: (1, 0.1) :: (2, 0.1) :: (3, 0.3) :: Nil,
      (0, 0.4) :: (1, 0.4) :: (2, 0.1) :: (3, 0.1) :: Nil,
      (0, 0.1) :: (1, 0.0) :: (2, 0.8) :: (3, 0.1) :: Nil,
      (0, 0.1) :: (1, 0.1) :: (2, 0.1) :: (3, 0.7) :: Nil
    )

    val TFIDFDrm = drm.drmParallelize(m = inCoreTFIDF, numPartitions = 2)

    val labelIndex = new java.util.HashMap[String,Integer]()
    labelIndex.put("Cat1", 3)
    labelIndex.put("Cat2", 2)
    labelIndex.put("Cat3", 1)
    labelIndex.put("Cat4", 0)

    // train a Standard NB Model
    val model = NaiveBayes.train(TFIDFDrm, labelIndex, false)

    // validate the model - will throw an exception if model is invalid
    model.validate()

    // check the label weights (the expected values are the row sums of inCoreTFIDF)
    (model.labelWeight(0) - 1.2).abs should be < epsilon
    (model.labelWeight(1) - 1.0).abs should be < epsilon
    (model.labelWeight(2) - 1.0).abs should be < epsilon
    (model.labelWeight(3) - 1.0).abs should be < epsilon

    // check the feature weights (the expected values are the column sums of inCoreTFIDF)
    (model.featureWeight(0) - 1.3).abs should be < epsilon
    (model.featureWeight(1) - 0.6).abs should be < epsilon
    (model.featureWeight(2) - 1.1).abs should be < epsilon
    (model.featureWeight(3) - 1.2).abs should be < epsilon
  }

  test("NB Aggregator") {

    // Row keys of the form "/<category>/<doc>/"; rows 0, 2, 4 are Cat1 and rows 1, 3 are Cat2.
    val rowBindings = new java.util.HashMap[String,Integer]()
    rowBindings.put("/Cat1/doc_a/", 0)
    rowBindings.put("/Cat2/doc_b/", 1)
    rowBindings.put("/Cat1/doc_c/", 2)
    rowBindings.put("/Cat2/doc_d/", 3)
    rowBindings.put("/Cat1/doc_e/", 4)

    val matrixSetup = sparse(
      (0, 0.1) :: (1, 0.0) :: (2, 0.1) :: (3, 0.0) :: Nil,
      (0, 0.0) :: (1, 0.1) :: (2, 0.0) :: (3, 0.1) :: Nil,
      (0, 0.1) :: (1, 0.0) :: (2, 0.1) :: (3, 0.0) :: Nil,
      (0, 0.0) :: (1, 0.1) :: (2, 0.0) :: (3, 0.1) :: Nil,
      (0, 0.1) :: (1, 0.0) :: (2, 0.1) :: (3, 0.0) :: Nil
    )

    matrixSetup.setRowLabelBindings(rowBindings)

    val TFIDFDrm = drm.drmParallelizeWithRowLabels(m = matrixSetup, numPartitions = 2)

    val (labelIndex, aggregatedTFIDFDrm) = NaiveBayes.extractLabelsAndAggregateObservations(TFIDFDrm)

    labelIndex.size should be (2)

    val cat1 = labelIndex("Cat1")
    val cat2 = labelIndex("Cat2")

    cat1 should be (0)
    cat2 should be (1)

    val aggregatedTFIDFInCore = aggregatedTFIDFDrm.collect
    aggregatedTFIDFInCore.numCols should be (4)
    aggregatedTFIDFInCore.numRows should be (2)

    // Expected cells are the per-category column sums of matrixSetup.
    (aggregatedTFIDFInCore.get(cat1, 0) - 0.3).abs should be < epsilon
    (aggregatedTFIDFInCore.get(cat1, 1) - 0.0).abs should be < epsilon
    (aggregatedTFIDFInCore.get(cat1, 2) - 0.3).abs should be < epsilon
    (aggregatedTFIDFInCore.get(cat1, 3) - 0.0).abs should be < epsilon
    (aggregatedTFIDFInCore.get(cat2, 0) - 0.0).abs should be < epsilon
    (aggregatedTFIDFInCore.get(cat2, 1) - 0.2).abs should be < epsilon
    (aggregatedTFIDFInCore.get(cat2, 2) - 0.0).abs should be < epsilon
    (aggregatedTFIDFInCore.get(cat2, 3) - 0.2).abs should be < epsilon

  }

  test("Model DFS Serialization") {

    // test from simulated sparse TF-IDF data
    val inCoreTFIDF = sparse(
      (0, 0.7) :: (1, 0.1) :: (2, 0.1) :: (3, 0.3) :: Nil,
      (0, 0.4) :: (1, 0.4) :: (2, 0.1) :: (3, 0.1) :: Nil,
      (0, 0.1) :: (1, 0.0) :: (2, 0.8) :: (3, 0.1) :: Nil,
      (0, 0.1) :: (1, 0.1) :: (2, 0.1) :: (3, 0.7) :: Nil
    )

    val labelIndex = new java.util.HashMap[String,Integer]()
    labelIndex.put("Cat1", 0)
    labelIndex.put("Cat2", 1)
    labelIndex.put("Cat3", 2)
    labelIndex.put("Cat4", 3)

    val TFIDFDrm = drm.drmParallelize(m = inCoreTFIDF, numPartitions = 2)

    // train a Standard NB Model with the explicit label index built above
    val model = NaiveBayes.train(TFIDFDrm, labelIndex, false)

    // validate the model - will throw an exception if model is invalid
    model.validate()

    // save the model
    model.dfsWrite(TmpDir)

    // reload a new model which should be equal to the original
    // (per the original author's note, this also triggers a validate() call)
    val materializedModel = NBModel.dfsRead(TmpDir)

    // check the label weights survive the round trip
    (model.labelWeight(0) - materializedModel.labelWeight(0)).abs should be < epsilon // 1.2
    (model.labelWeight(1) - materializedModel.labelWeight(1)).abs should be < epsilon // 1.0
    (model.labelWeight(2) - materializedModel.labelWeight(2)).abs should be < epsilon // 1.0
    (model.labelWeight(3) - materializedModel.labelWeight(3)).abs should be < epsilon // 1.0

    // check the feature weights survive the round trip
    (model.featureWeight(0) - materializedModel.featureWeight(0)).abs should be < epsilon // 1.3
    (model.featureWeight(1) - materializedModel.featureWeight(1)).abs should be < epsilon // 0.6
    (model.featureWeight(2) - materializedModel.featureWeight(2)).abs should be < epsilon // 1.1
    (model.featureWeight(3) - materializedModel.featureWeight(3)).abs should be < epsilon // 1.2

    // check to see if the new model is complementary
    materializedModel.isComplementary should be (model.isComplementary)

    // check the label index maps; asserting on the value (not on a Boolean)
    // makes a failure report the offending label's indices
    for ((label, index) <- model.labelIndex) {
      materializedModel.labelIndex(label) should be (index)
    }
  }

  test("train and test a model") {

    // test from simulated sparse TF-IDF data
    val inCoreTFIDF = sparse(
      (0, 0.7) :: (1, 0.1) :: (2, 0.1) :: (3, 0.3) :: Nil,
      (0, 0.4) :: (1, 0.4) :: (2, 0.1) :: (3, 0.1) :: Nil,
      (0, 0.1) :: (1, 0.0) :: (2, 0.8) :: (3, 0.1) :: Nil,
      (0, 0.1) :: (1, 0.1) :: (2, 0.1) :: (3, 0.7) :: Nil
    )

    val labelIndex = new java.util.HashMap[String,Integer]()
    labelIndex.put("/Cat1/", 0)
    labelIndex.put("/Cat2/", 1)
    labelIndex.put("/Cat3/", 2)
    labelIndex.put("/Cat4/", 3)

    val TFIDFDrm = drm.drmParallelize(m = inCoreTFIDF, numPartitions = 2)

    // train a Standard NB Model with the explicit label index built above
    val model = NaiveBayes.train(TFIDFDrm, labelIndex, false)

    // validate the model - will throw an exception if model is invalid
    model.validate()

    // save the model
    model.dfsWrite(TmpDir)

    // reload a new model which should be equal to the original
    // (per the original author's note, this also triggers a validate() call)
    val materializedModel = NBModel.dfsRead(TmpDir)

    // check to see if the new model is complementary
    materializedModel.isComplementary should be (model.isComplementary)

    // check the label index maps
    for ((label, index) <- model.labelIndex) {
      materializedModel.labelIndex(label) should be (index)
    }

    // self test on the original set; the label index doubles as the row-label bindings
    val inCoreTFIDFWithLabels = inCoreTFIDF.clone()
    inCoreTFIDFWithLabels.setRowLabelBindings(labelIndex)
    val TFIDFDrmWithLabels = drm.drmParallelizeWithRowLabels(m = inCoreTFIDFWithLabels, numPartitions = 2)

    // Smoke test only: the result is not inspected, the call just must not throw.
    NaiveBayes.test(materializedModel, TFIDFDrmWithLabels, false)

  }

  test("train and test a model with the confusion matrix") {

    // Ten documents alternating between Cat1 (even rows) and Cat2 (odd rows).
    val rowBindings = new java.util.HashMap[String,Integer]()
    rowBindings.put("/Cat1/doc_a/", 0)
    rowBindings.put("/Cat2/doc_b/", 1)
    rowBindings.put("/Cat1/doc_c/", 2)
    rowBindings.put("/Cat2/doc_d/", 3)
    rowBindings.put("/Cat1/doc_e/", 4)
    rowBindings.put("/Cat2/doc_f/", 5)
    rowBindings.put("/Cat1/doc_g/", 6)
    rowBindings.put("/Cat2/doc_h/", 7)
    rowBindings.put("/Cat1/doc_i/", 8)
    rowBindings.put("/Cat2/doc_j/", 9)

    // fixed seed so the generated matrix is the same on every run
    val seed = 1

    val matrixSetup = Matrices.uniformView(10, 50, seed)

    println("TFIDF matrix")
    println(matrixSetup)

    matrixSetup.setRowLabelBindings(rowBindings)

    val TFIDFDrm = drm.drmParallelizeWithRowLabels(matrixSetup)

    // println("Parallelized and Collected")
    // println(TFIDFDrm.collect)

    val (labelIndex, aggregatedTFIDFDrm) = NaiveBayes.extractLabelsAndAggregateObservations(TFIDFDrm)

    println("Aggregated by key")
    println(aggregatedTFIDFDrm.collect)
    println(labelIndex)

    // train a Standard NB Model on the aggregated observations and the extracted label index
    val model = NaiveBayes.train(aggregatedTFIDFDrm, labelIndex, false)

    // validate the model - will throw an exception if model is invalid
    model.validate()

    // save the model
    model.dfsWrite(TmpDir)

    // reload a new model which should be equal to the original
    // (per the original author's note, this also triggers a validate() call)
    val materializedModel = NBModel.dfsRead(TmpDir)

    // check to see if the new model is complementary
    materializedModel.isComplementary should be (model.isComplementary)

    // check the label index maps
    for ((label, index) <- model.labelIndex) {
      materializedModel.labelIndex(label) should be (index)
    }

    // val testTFIDFDrm = drm.drmParallelizeWithRowLabels(m = matrixSetup, numPartitions = 2)

    // self test on this model
    val result = NaiveBayes.test(materializedModel, TFIDFDrm, false)

    println(result)

    // All ten training documents must be classified into their own category.
    val confusion = result.getConfusionMatrix.getMatrix
    confusion.getQuick(0, 0) should be (5)
    confusion.getQuick(0, 1) should be (0)
    confusion.getQuick(1, 0) should be (0)
    confusion.getQuick(1, 1) should be (5)

  }

}
// http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/classifier/stats/ClassifierStatsTestBase.scala
// ----------------------------------------------------------------------
// diff --git a/math-scala/src/test/scala/org/apache/mahout/classifier/stats/ClassifierStatsTestBase.scala b/math-scala/src/test/scala/org/apache/mahout/classifier/stats/ClassifierStatsTestBase.scala
// deleted file mode 100644
// index eafde11..0000000
// --- a/math-scala/src/test/scala/org/apache/mahout/classifier/stats/ClassifierStatsTestBase.scala
// +++ /dev/null
// @@ -1,257 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.classifier.stats

import java.lang.Double
import java.util.Random
import java.util.Arrays

import org.apache.mahout.common.RandomUtils
import org.apache.mahout.math.Matrix
import org.apache.mahout.test.DistributedMahoutSuite
import org.scalatest.{FunSuite, Matchers}



/**
 * Tests for the running-average helpers and `ConfusionMatrix`, meant to be
 * mixed into a concrete `FunSuite` (see the self-type).
 *
 * NOTE: `Double` in this file is `java.lang.Double` (see the imports), which
 * is where `Double.isNaN` comes from.
 */
trait ClassifierStatsTestBase extends DistributedMahoutSuite with Matchers { this: FunSuite =>

  /** Tight absolute tolerance for exact arithmetic expectations. */
  val epsilon = 1E-6

  /** Loose absolute tolerance for the statistical test on random data. */
  val smallEpsilon = 1.0

  // FullRunningAverageAndStdDev tests
  test("testFullRunningAverageAndStdDev") {
    val average: RunningAverageAndStdDev = new FullRunningAverageAndStdDev
    assert(0 == average.getCount)
    assert(Double.isNaN(average.getAverage))
    assert(Double.isNaN(average.getStandardDeviation))
    average.addDatum(6.0)
    assert(1 == average.getCount)
    assert((6.0 - average.getAverage).abs < epsilon)
    assert(Double.isNaN(average.getStandardDeviation))
    average.addDatum(6.0)
    assert(2 == average.getCount)
    assert((6.0 - average.getAverage).abs < epsilon)
    assert((0.0 - average.getStandardDeviation).abs < epsilon)
    average.removeDatum(6.0)
    assert(1 == average.getCount)
    assert((6.0 - average.getAverage).abs < epsilon)
    assert(Double.isNaN(average.getStandardDeviation))
    average.addDatum(-4.0)
    assert(2 == average.getCount)
    assert((1.0 - average.getAverage).abs < epsilon)
    assert(((5.0 * 1.4142135623730951) - average.getStandardDeviation).abs < epsilon)
    // 4.0 was never added; the expectation below (average == -2.0) pins what
    // removeDatum does with a value that is not in the data set.
    average.removeDatum(4.0)
    assert(1 == average.getCount)
    assert((2.0 + average.getAverage).abs < epsilon)
    assert(Double.isNaN(average.getStandardDeviation))
  }

  test("testBigFullRunningAverageAndStdDev") {
    val average: RunningAverageAndStdDev = new FullRunningAverageAndStdDev
    RandomUtils.useTestSeed()
    val r: Random = RandomUtils.getRandom

    for (i <- 0 until 100000) {
      average.addDatum(r.nextDouble() * 1000.0)
    }

    // Uniform(0, 1000): mean 500, standard deviation 1000 / sqrt(12).
    assert((500.0 - average.getAverage).abs < smallEpsilon)
    assert(((1000.0 / Math.sqrt(12.0)) - average.getStandardDeviation).abs < smallEpsilon)
  }

  test("testStddevFullRunningAverageAndStdDev") {
    val runningAverage: RunningAverageAndStdDev = new FullRunningAverageAndStdDev
    assert(0 == runningAverage.getCount)
    assert(Double.isNaN(runningAverage.getAverage))
    runningAverage.addDatum(1.0)
    assert(1 == runningAverage.getCount)
    assert((1.0 - runningAverage.getAverage).abs < epsilon)
    assert(Double.isNaN(runningAverage.getStandardDeviation))
    runningAverage.addDatum(1.0)
    assert(2 == runningAverage.getCount)
    assert((1.0 - runningAverage.getAverage).abs < epsilon)
    assert((0.0 - runningAverage.getStandardDeviation).abs < epsilon)
    runningAverage.addDatum(7.0)
    assert(3 == runningAverage.getCount)
    assert((3.0 - runningAverage.getAverage).abs < epsilon)
    assert((3.464101552963257 - runningAverage.getStandardDeviation).abs < epsilon)
    runningAverage.addDatum(5.0)
    assert(4 == runningAverage.getCount)
    // .abs added: the signed form accepted any average larger than 3.5
    assert((3.5 - runningAverage.getAverage).abs < epsilon)
    assert((3.0 - runningAverage.getStandardDeviation).abs < epsilon)
  }



  // FullRunningAverage tests
  test("testFullRunningAverage"){
    val runningAverage: RunningAverage = new FullRunningAverage
    assert(0 == runningAverage.getCount)
    assert(Double.isNaN(runningAverage.getAverage))
    runningAverage.addDatum(1.0)
    assert(1 == runningAverage.getCount)
    assert((1.0 - runningAverage.getAverage).abs < epsilon)
    runningAverage.addDatum(1.0)
    assert(2 == runningAverage.getCount)
    assert((1.0 - runningAverage.getAverage).abs < epsilon)
    runningAverage.addDatum(4.0)
    assert(3 == runningAverage.getCount)
    // .abs added: the signed form accepted any average larger than 2.0
    assert((2.0 - runningAverage.getAverage).abs < epsilon)
    runningAverage.addDatum(-4.0)
    assert(4 == runningAverage.getCount)
    assert((0.5 - runningAverage.getAverage).abs < epsilon)
    runningAverage.removeDatum(-4.0)
    assert(3 == runningAverage.getCount)
    assert((2.0 - runningAverage.getAverage).abs < epsilon)
    runningAverage.removeDatum(4.0)
    assert(2 == runningAverage.getCount)
    assert((1.0 - runningAverage.getAverage).abs < epsilon)
    runningAverage.changeDatum(0.0)
    assert(2 == runningAverage.getCount)
    assert((1.0 - runningAverage.getAverage).abs < epsilon)
    runningAverage.changeDatum(2.0)
    assert(2 == runningAverage.getCount)
    assert((2.0 - runningAverage.getAverage).abs < epsilon)
  }


  test("testFullRunningAveragCopyConstructor") {
    val runningAverage: RunningAverage = new FullRunningAverage
    runningAverage.addDatum(1.0)
    runningAverage.addDatum(1.0)
    assert(2 == runningAverage.getCount)
    // .abs added here and below: the signed form accepted any average larger than 1.0
    assert((1.0 - runningAverage.getAverage).abs < epsilon)
    val copy: RunningAverage = new FullRunningAverage(runningAverage.getCount, runningAverage.getAverage)
    assert(2 == copy.getCount)
    assert((1.0 - copy.getAverage).abs < epsilon)
  }



  // Inverted Running Average tests
  test("testInvertedRunningAverage") {
    val avg: RunningAverage = new FullRunningAverage
    val inverted: RunningAverage = new InvertedRunningAverage(avg)
    assert(0 == inverted.getCount)
    avg.addDatum(1.0)
    assert(1 == inverted.getCount)
    assert((1.0 + inverted.getAverage).abs < epsilon) // inverted.getAverage == -1.0
    avg.addDatum(2.0)
    assert(2 == inverted.getCount)
    assert((1.5 + inverted.getAverage).abs < epsilon) // inverted.getAverage == -1.5
  }

  test ("testInvertedRunningAverageAndStdDev") {
    val avg: RunningAverageAndStdDev = new FullRunningAverageAndStdDev
    val inverted: RunningAverageAndStdDev = new InvertedRunningAverageAndStdDev(avg)
    assert(0 == inverted.getCount)
    avg.addDatum(1.0)
    assert(1 == inverted.getCount)
    assert((1.0 + inverted.getAverage).abs < epsilon) // inverted.getAverage == -1.0
    avg.addDatum(2.0)
    assert(2 == inverted.getCount)
    assert((1.5 + inverted.getAverage).abs < epsilon) // inverted.getAverage == -1.5
    assert(((Math.sqrt(2.0) / 2.0) - inverted.getStandardDeviation).abs < epsilon)
  }


  // confusion Matrix tests
  // VALUES(i)(j): count stored for (LABELS(i), LABELS(j)).
  val VALUES: Array[Array[Int]] = Array(Array(2, 3), Array(10, 20))
  val LABELS: Array[String] = Array("Label1", "Label2")
  // OTHER(i): count stored for (LABELS(i), DEFAULT_LABEL).
  val OTHER: Array[Int] = Array(3, 6)
  val DEFAULT_LABEL: String = "other"

  /**
   * Builds a confusion matrix over `labels` plus `defaultLabel`.
   *
   * The original ignored both `labels` (beyond building the label list) and
   * `defaultLabel` when storing counts, using hard-coded "Label1"/"Label2" and
   * the trait-level DEFAULT_LABEL instead; the parameters are now honoured.
   * For the arguments used in this suite the stored counts are unchanged.
   *
   * @param values       values(i)(j) is stored for the pair (labels(i), labels(j))
   * @param labels       the two regular labels
   * @param defaultLabel label that receives the OTHER(i) count of each regular label
   * @return the populated confusion matrix
   */
  def fillConfusionMatrix(values: Array[Array[Int]], labels: Array[String], defaultLabel: String): ConfusionMatrix = {
    val labelList = Arrays.asList(labels(0), labels(1))
    val confusionMatrix: ConfusionMatrix = new ConfusionMatrix(labelList, defaultLabel)
    confusionMatrix.putCount(labels(0), labels(0), values(0)(0))
    confusionMatrix.putCount(labels(0), labels(1), values(0)(1))
    confusionMatrix.putCount(labels(1), labels(0), values(1)(0))
    confusionMatrix.putCount(labels(1), labels(1), values(1)(1))
    confusionMatrix.putCount(labels(0), defaultLabel, OTHER(0))
    confusionMatrix.putCount(labels(1), defaultLabel, OTHER(1))

    confusionMatrix
  }

  /** Asserts the per-label accuracies (in percent) of the matrix built from VALUES/OTHER. */
  private def checkAccuracy(cm: ConfusionMatrix): Unit = {
    val labelstrs = cm.getLabels
    assert(3 == labelstrs.size)
    assert((25.0 - cm.getAccuracy("Label1")).abs < epsilon)        // 2 of 2 + 3 + 3
    assert((55.5555555 - cm.getAccuracy("Label2")).abs < epsilon)  // 20 of 10 + 20 + 6
    assert(Double.isNaN(cm.getAccuracy("other")))
  }

  /** Asserts that the raw counts and the label set match VALUES, OTHER, LABELS and DEFAULT_LABEL. */
  private def checkValues(cm: ConfusionMatrix): Unit = {
    val counts: Array[Array[Int]] = cm.getConfusionMatrix
    // Result deliberately discarded; presumably a smoke check that toString does not throw.
    cm.toString
    assert(counts.length == counts(0).length)
    assert(3 == counts.length)
    assert(VALUES(0)(0) == counts(0)(0))
    assert(VALUES(0)(1) == counts(0)(1))
    assert(VALUES(1)(0) == counts(1)(0))
    assert(VALUES(1)(1) == counts(1)(1))
    // nothing was ever recorded for the default label's own row
    assert(Arrays.equals(new Array[Int](3), counts(2)))
    assert(OTHER(0) == counts(0)(2))
    assert(OTHER(1) == counts(1)(2))
    assert(3 == cm.getLabels.size)
    assert(cm.getLabels.contains(LABELS(0)))
    assert(cm.getLabels.contains(LABELS(1)))
    assert(cm.getLabels.contains(DEFAULT_LABEL))
  }

  test("testBuild"){
    val confusionMatrix: ConfusionMatrix = fillConfusionMatrix(VALUES, LABELS, DEFAULT_LABEL)
    checkValues(confusionMatrix)
    checkAccuracy(confusionMatrix)
  }

  test("GetMatrix") {
    val confusionMatrix: ConfusionMatrix = fillConfusionMatrix(VALUES, LABELS, DEFAULT_LABEL)
    val m: Matrix = confusionMatrix.getMatrix
    val rowLabels = m.getRowLabelBindings
    assert(confusionMatrix.getLabels.size == m.numCols)
    assert(rowLabels.keySet.contains(LABELS(0)))
    assert(rowLabels.keySet.contains(LABELS(1)))
    assert(rowLabels.keySet.contains(DEFAULT_LABEL))
    assert(2 == confusionMatrix.getCorrect(LABELS(0)))
    assert(20 == confusionMatrix.getCorrect(LABELS(1)))
    assert(0 == confusionMatrix.getCorrect(DEFAULT_LABEL))
  }

  /**
   * Example taken from
   * http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
   */
  test("testPrecisionRecallAndF1ScoreAsScikitLearn") {
    val labelList = Arrays.asList("0", "1", "2")
    val confusionMatrix: ConfusionMatrix = new ConfusionMatrix(labelList, "DEFAULT")
    confusionMatrix.putCount("0", "0", 2)
    confusionMatrix.putCount("1", "0", 1)
    confusionMatrix.putCount("1", "2", 1)
    confusionMatrix.putCount("2", "1", 2)
    // reference values are only given to three decimals
    val delta = 0.001
    assert((0.222 - confusionMatrix.getWeightedPrecision).abs < delta)
    assert((0.333 - confusionMatrix.getWeightedRecall).abs < delta)
    assert((0.266 - confusionMatrix.getWeightedF1score).abs < delta)
  }



}
// http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuiteBase.scala
// ----------------------------------------------------------------------
// diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuiteBase.scala
// deleted file mode 100644
// index 70fb10f..0000000
// --- a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/ClusteringSuiteBase.scala
// +++ /dev/null
// @@ -1,48 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license
agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.mahout.math.algorithms

import org.apache.mahout.math.algorithms.preprocessing._
import org.apache.mahout.math.drm.drmParallelize
import org.apache.mahout.math.scalabindings.{dense, sparse, svec}
import org.apache.mahout.math.scalabindings.RLikeOps._
import org.apache.mahout.test.DistributedMahoutSuite
import org.scalatest.{FunSuite, Matchers}

import org.apache.mahout.test.DistributedMahoutSuite

/**
 * Clustering tests, meant to be mixed into a concrete `FunSuite`
 * (see the self-type).
 */
trait ClusteringSuiteBase extends DistributedMahoutSuite with Matchers {

  this: FunSuite =>

  test("canopy test") {
    // five 4-dimensional points
    val drmA = drmParallelize(dense(
      (1.0, 1.2, 1.3, 1.4),
      (1.1, 1.5, 2.5, 1.0),
      (6.0, 5.2, -5.2, 5.3),
      (7.0,6.0, 5.0, 5.0),
      (10.0, 1.0, 20.0, -10.0)))

    import org.apache.mahout.math.algorithms.clustering.CanopyClustering

    val model = new CanopyClustering().fit(drmA, 't1 -> 6.5, 't2 -> 5.5, 'distanceMeasure -> 'Chebyshev)
    val myAnswer = model.cluster(drmA).collect

    // expected cluster id per input row
    val correctAnswer = dense((0.0), (0.0), (1.0), (0.0), (2.0))

    val epsilon = 1E-6
    // .abs added: the signed difference passed whenever myAnswer's norm was too small.
    // NOTE(review): this still compares only the two norms, so different
    // assignments with the same norm would pass; consider comparing
    // element-wise once the intended result shape is confirmed.
    (myAnswer.norm - correctAnswer.norm).abs should be <= epsilon
  }
}
// \ No newline at end of file
// http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuiteBase.scala
// ----------------------------------------------------------------------
// diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuiteBase.scala
// deleted file mode 100644
// index ffe1d1b..0000000
// --- a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuiteBase.scala
// +++ /dev/null
// @@ -1,118 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.mahout.math.algorithms

import org.apache.mahout.math.algorithms.preprocessing._
import org.apache.mahout.math.drm.drmParallelize
import org.apache.mahout.math.scalabindings.{dense, sparse, svec}
import org.apache.mahout.math.scalabindings.RLikeOps._
import org.apache.mahout.test.DistributedMahoutSuite
import org.scalatest.{FunSuite, Matchers}

/**
 * Tests for the preprocessing transformers (AsFactor, StandardScaler,
 * MeanCenter), meant to be mixed into a concrete `FunSuite` (see the self-type).
 *
 * NOTE(review): every test compares only the NORM of the result with the norm
 * of the expected matrix, so a result with the right magnitude but wrong
 * entries would pass; consider an element-wise comparison.
 */
trait PreprocessorSuiteBase extends DistributedMahoutSuite with Matchers {

  this: FunSuite =>

  test("asfactor test") {
    val A = drmParallelize(dense(
      (3, 2, 1, 2),
      (0, 0, 0, 0),
      (1, 1, 1, 1)), numPartitions = 2)

    // 0 -> 2, 3 -> 5, 6 -> 9
    val factorizer: AsFactorModel = new AsFactor().fit(A)

    val factoredA = factorizer.transform(A)

    println(factoredA)
    println(factorizer.factorMap)
    // The arrow operator had been mis-encoded in this copy of the source;
    // it is restored as the ASCII `->` tuple arrow.
    val correctAnswer = sparse(
      svec((3 -> 1.0) :: (6 -> 1.0) :: (8 -> 1.0) :: (11 -> 1.0) :: Nil, cardinality = 12),
      svec((0 -> 1.0) :: (4 -> 1.0) :: (7 -> 1.0) :: ( 9 -> 1.0) :: Nil, cardinality = 12),
      svec((1 -> 1.0) :: (5 -> 1.0) :: (8 -> 1.0) :: (10 -> 1.0) :: Nil, cardinality = 12)
    )

    val myAnswer = factoredA.collect

    val epsilon = 1E-6
    // .abs added (a too-small norm used to pass); the verbatim duplicate of
    // this assertion was dropped.
    (myAnswer.norm - correctAnswer.norm).abs should be <= epsilon

  }

  test("standard scaler test") {
    /**
     * R Prototype
     * x <- matrix( c(1,2,3,1,5,9,5,-15,-2), nrow=3)
     * scale(x, scale= apply(x, 2, sd) * sqrt(2/3))
     * # ^^ note: R uses degrees of freedom = 1 for standard deviation calculations.
     * # we don't (and neither does sklearn)
     * # the *sqrt(N-1/N) 'undoes' the degrees of freedom = 1
     */

    val A = drmParallelize(dense(
      (1, 1, 5),
      (2, 5, -15),
      (3, 9, -2)), numPartitions = 2)

    val scaler: StandardScalerModel = new StandardScaler().fit(A)

    val correctAnswer = dense(
      (-1.224745, -1.224745, -1.224745),
      (0.000000, 0.000000, 1.224745),
      (1.224745, 1.224745, 0.000000))

    val myAnswer = scaler.transform(A).collect
    println(scaler.meanVec)
    println(scaler.stdev)

    val epsilon = 1E-6
    (myAnswer.norm - correctAnswer.norm).abs should be <= epsilon

  }

  test("mean center test") {
    /**
     * R Prototype
     *
     * x <- matrix( c(1.0,2.0,3.0,1.0,5.0,9.0,-2.0,2.0,0), nrow=3)
     * centered.x <- scale(x, scale= FALSE)
     * print(centered.x)
     */


    val A = drmParallelize(dense(
      (1, 1, -2),
      (2, 5, 2),
      (3, 9, 0)), numPartitions = 2)

    val scaler: MeanCenterModel = new MeanCenter().fit(A)

    val myAnswer = scaler.transform(A).collect

    val correctAnswer = dense(
      (-1, -4, -2),
      (0, 0, 2),
      (1, 4, 0))

    val epsilon = 1E-6
    (myAnswer.norm - correctAnswer.norm).abs should be <= epsilon
  }
}
// http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala
// ----------------------------------------------------------------------
// diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala
// deleted file mode 100644
// index 8910ae9..0000000
// --- a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala
// +++ /dev/null
// @@ -1,180 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.math.algorithms - -import org.apache.mahout.math.algorithms.regression._ -import org.apache.mahout.math.drm._ -import org.apache.mahout.math.drm.RLikeDrmOps._ -import org.apache.mahout.math.scalabindings._ -import org.apache.mahout.math.scalabindings.RLikeOps._ -import org.apache.mahout.test.DistributedMahoutSuite -import org.scalatest.{FunSuite, Matchers} - -trait RegressionSuiteBase extends DistributedMahoutSuite with Matchers { - this: FunSuite => - - val epsilon = 1E-6 - - test("ordinary least squares") { - /* - R Prototype: - dataM <- matrix( c(2, 2, 10.5, 10, 29.509541, - 1, 2, 12, 12, 18.042851, - 1, 1, 12, 13, 22.736446, - 2, 1, 11, 13, 32.207582, - 1, 2, 12, 11, 21.871292, - 2, 1, 16, 8, 36.187559, - 6, 2, 17, 1, 50.764999, - 3, 2, 13, 7, 40.400208, - 3, 3, 13, 4, 45.811716), nrow=9, ncol=5, byrow=TRUE) - - - X = dataM[, c(1,2,3,4)] - y = dataM[, c(5)] - - model <- lm(y ~ X ) - summary(model) - - */ - - val drmData = drmParallelize(dense( - (2, 2, 10.5, 10, 29.509541), // Apple Cinnamon Cheerios - (1, 2, 12, 12, 18.042851), // Cap'n'Crunch - (1, 1, 12, 13, 22.736446), // Cocoa Puffs - (2, 1, 11, 13, 32.207582), // Froot Loops - (1, 2, 12, 11, 21.871292), // Honey Graham Ohs - (2, 1, 16, 8, 36.187559), // Wheaties Honey Gold - (6, 2, 17, 1, 50.764999), // Cheerios - (3, 2, 13, 7, 40.400208), // Clusters - (3, 3, 13, 4, 45.811716)), numPartitions = 
2) - - - val drmX = drmData(::, 0 until 4) - val drmY = drmData(::, 4 until 5) - - val model = new OrdinaryLeastSquares[Int]().fit(drmX, drmY, 'calcCommonStatistics â false) - - val estimate = model.beta - val Ranswers = dvec(-1.336265, -13.157702, -4.152654, -5.679908, 163.179329) - - val epsilon = 1E-6 - (estimate - Ranswers).sum should be < epsilon - - // TODO add test for S.E / pvalue - } - - test("cochrane-orcutt"){ - /* R Prototype: - library(orcutt) - - df = data.frame(t(data.frame( - c(20.96, 127.3), - c(21.40, 130.0), - c(21.96, 132.7), - c(21.52, 129.4), - c(22.39, 135.0), - c(22.76, 137.1), - c(23.48, 141.2), - c(23.66, 142.8), - c(24.10, 145.5), - c(24.01, 145.3), - c(24.54, 148.3), - c(24.30, 146.4), - c(25.00, 150.2), - c(25.64, 153.1), - c(26.36, 157.3), - c(26.98, 160.7), - c(27.52, 164.2), - c(27.78, 165.6), - c(28.24, 168.7), - c(28.78, 171.7)))) - - rownames(df) <- NULL - colnames(df) <- c("y", "x") - my_lm = lm(y ~ x, data=df) - coch = cochrane.orcutt(my_lm) - - /////////////////////////////////////// - The R-implementation is kind of...silly. - - The above works- converges at 318 iterations- the transformed DW is 1.72, yet the rho is - .95882. After 318 iteartions, this will also report a rho of .95882 (which sugguests SEVERE - autocorrelation- nothing close to 1.72. - - At anyrate, the real prototype for this is the example from Applied Linear Statistcal Models - 5th Edition by Kunter, Nachstheim, Neter, and Li. They also provide some interesting notes on p 494: - 1) "Cochrane-Orcutt does not always work properly. A major reason is that when the error terms - are positively autocorrelated, the estimate r in (12.22) tends to underestimate the autocorrelation - parameter rho. When this bias is serious, it can significantly reduce the effectiveness of the - Cochrane-Orcutt approach. - 2. 
There exists an approximate relation between the Durbin Watson test statistic D in (12.14) - and the estimated autocorrelation paramater r in (12.22): - D ~= 2(1-r)" - - They also note on p492: - "... If the process does not terminate after one or two iterations, a different procedure - should be employed." - This differs from the logic found elsewhere, and the method presented in R where, in the simple - example in the prototype, the procedure runs for 318 iterations. This is why the default - maximum iteratoins are 3, and should be left as such. - - Also, the prototype and 'correct answers' are based on the example presented in Kunter et. al on - p492-4 (including dataset). - - */ - - val alsmBlaisdellCo = drmParallelize( dense( - (20.96, 127.3), - (21.40, 130.0), - (21.96, 132.7), - (21.52, 129.4), - (22.39, 135.0), - (22.76, 137.1), - (23.48, 141.2), - (23.66, 142.8), - (24.10, 145.5), - (24.01, 145.3), - (24.54, 148.3), - (24.30, 146.4), - (25.00, 150.2), - (25.64, 153.1), - (26.36, 157.3), - (26.98, 160.7), - (27.52, 164.2), - (27.78, 165.6), - (28.24, 168.7), - (28.78, 171.7) )) - - val drmY = alsmBlaisdellCo(::, 0 until 1) - val drmX = alsmBlaisdellCo(::, 1 until 2) - - var coModel = new CochraneOrcutt[Int]().fit(drmX, drmY , ('iterations -> 2)) - val coResiduals = drmY - coModel.predict(drmX) - - val correctRho = 0.631166 - (coModel.rhos(1) - correctRho) should be < epsilon - - val shortEpsilon = 1E-4 // book rounded off pretty short - val correctBeta = dvec(0.17376, -1.0685) - (coModel.betas(1) - correctBeta).sum.abs < shortEpsilon - - val correctSe = dvec(0.002957, 0.45332) - (coModel.se - correctSe).sum.abs < shortEpsilon - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala 
b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala deleted file mode 100644 index 57dffef..0000000 --- a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala +++ /dev/null @@ -1,126 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.mahout.math.algorithms - -import org.apache.mahout.math.algorithms.regression.OrdinaryLeastSquares -import org.apache.mahout.math.algorithms.regression.tests._ -import org.apache.mahout.math.drm.{CheckpointedDrm, drmParallelize} -import org.apache.mahout.math.drm.RLikeDrmOps._ -import org.apache.mahout.math.scalabindings.{`::`, dense} -import org.apache.mahout.test.DistributedMahoutSuite -import org.scalatest.{FunSuite, Matchers} - - -trait RegressionTestsSuiteBase extends DistributedMahoutSuite with Matchers { - this: FunSuite => - - val epsilon = 1E-4 - - test("fittness tests") { - /* - R Prototype: - dataM <- matrix( c(2, 2, 10.5, 10, 29.509541, - 1, 2, 12, 12, 18.042851, - 1, 1, 12, 13, 22.736446, - 2, 1, 11, 13, 32.207582, - 1, 2, 12, 11, 21.871292, - 2, 1, 16, 8, 36.187559, - 6, 2, 17, 1, 50.764999, - 3, 2, 13, 7, 40.400208, - 3, 3, 13, 4, 45.811716), nrow=9, ncol=5, byrow=TRUE) - - - X = dataM[, c(1,2,3,4)] - y = dataM[, c(5)] - - model <- lm(y ~ X) - summary(model) - - */ - - val drmData = drmParallelize(dense( - (2, 2, 10.5, 10, 29.509541), // Apple Cinnamon Cheerios - (1, 2, 12, 12, 18.042851), // Cap'n'Crunch - (1, 1, 12, 13, 22.736446), // Cocoa Puffs - (2, 1, 11, 13, 32.207582), // Froot Loops - (1, 2, 12, 11, 21.871292), // Honey Graham Ohs - (2, 1, 16, 8, 36.187559), // Wheaties Honey Gold - (6, 2, 17, 1, 50.764999), // Cheerios - (3, 2, 13, 7, 40.400208), // Clusters - (3, 3, 13, 4, 45.811716)), numPartitions = 2) - - val drmX = drmData(::, 0 until 4) - val drmY = drmData(::, 4 until 5) - - val model = new OrdinaryLeastSquares[Int]().fit(drmX, drmY) - - println(model.summary) - // Answers from running similar algorithm in R - val rR2 = 0.9425 - val rMSE = 6.457157 - - val r2: Double = model.r2 - val mse: Double = model.mse - (rR2 - r2) should be < epsilon - (rMSE - mse) should be < epsilon - - Math.abs(model.beta.get(4) - 163.17933 ) should be < epsilon - Math.abs(model.beta.get(0) - (-1.33627) ) should be < epsilon - 
Math.abs(model.beta.get(1) - (-13.15770)) should be < epsilon - Math.abs(model.beta.get(2) - (-4.15265) ) should be < epsilon - Math.abs(model.beta.get(3) - (-5.679908)) should be < epsilon - - Math.abs(model.tScore.get(0) - (-0.49715717)) should be < epsilon - Math.abs(model.tScore.get(1) - (-2.43932888)) should be < epsilon - Math.abs(model.tScore.get(2) - (-2.32654000)) should be < epsilon - Math.abs(model.tScore.get(3) - (-3.01022444)) should be < epsilon - Math.abs(model.tScore.get(4) - 3.143183937 ) should be < epsilon - - model.degreesOfFreedom should equal(5) - model.trainingExamples should equal(9) - - Math.abs((model.fScore - 16.38542361)) should be < 0.0000001 - - } - - test("durbinWatsonTest test") { - /** - * R Prototype - * - * library(car) - * residuals <- seq(0, 4.9, 0.1) - * ## perform Durbin-Watson test - * durbinWatsonTest(residuals) - */ - - val correctAnswer = 0.001212121 - val err1 = drmParallelize( dense((0.0 until 5.0 by 0.1).toArray) ).t - val drmX = drmParallelize( dense((0 until 50).toArray.map( t => Math.pow(-1.0, t)) ) ).t - val drmY = drmX + err1 + 1 - var model = new OrdinaryLeastSquares[Int]().fit(drmX, drmY) - val syntheticResiduals = err1 - model = AutocorrelationTests.DurbinWatson(model, syntheticResiduals) - val myAnswer: Double = model.testResults.getOrElse('durbinWatsonTestStatistic, -1.0).asInstanceOf[Double] - (myAnswer - correctAnswer) should be < epsilon - } - - -} - http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/backend/BackendSuite.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/test/scala/org/apache/mahout/math/backend/BackendSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/backend/BackendSuite.scala deleted file mode 100644 index ba6e145..0000000 --- a/math-scala/src/test/scala/org/apache/mahout/math/backend/BackendSuite.scala +++ /dev/null @@ -1,59 +0,0 @@ -package 
org.apache.mahout.math.backend - -import org.apache.mahout.math.backend.jvm.JvmBackend -import org.scalatest.{FunSuite, Matchers} - -import scala.collection.mutable -import scala.reflect.{ClassTag, classTag} - -class BackendSuite extends FunSuite with Matchers { - - test("GenericBackend") { - - trait MySolverTrait1 { def myMethod1 = Unit } - - - trait MySolverTrait2 - - class MySolverImpl1 extends MySolverTrait1 { - } - - class MySolverImpl2 extends MySolverTrait2 - - // My dummy backend supporting to trait solvers filled with 2 dummy implementations of these - // traits should be able to serve based on their solver traits. - val myBackend = new Backend { - - override def isAvailable: Boolean = true - - override val solverMap = new mutable.HashMap[ClassTag[_], Any]() - - solverMap ++= Map( - classTag[MySolverTrait1] → new MySolverImpl1, - classTag[MySolverTrait2] → new MySolverImpl2 - ) - - validateMap() - } - - myBackend.getSolver shouldBe None - - val mySolver1 = myBackend.getSolver[MySolverTrait1] - - // This is indeed solver1 trait type: - mySolver1.get.myMethod1 - mySolver1.get.isInstanceOf[MySolverImpl1] shouldBe true - - // Validator should not allow non-subclasses in implementation. - an [IllegalArgumentException] mustBe thrownBy { - myBackend.solverMap(classTag[MySolverTrait2]) = 0 - myBackend.validateMap() - } - } - - test("JvmBackend") { - // Just create JVM backend and validate. 
- JvmBackend.validateMap() - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/decompositions/DecompositionsSuite.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/test/scala/org/apache/mahout/math/decompositions/DecompositionsSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/decompositions/DecompositionsSuite.scala deleted file mode 100644 index 8f5ec99..0000000 --- a/math-scala/src/test/scala/org/apache/mahout/math/decompositions/DecompositionsSuite.scala +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.math.decompositions - -import org.scalatest.FunSuite -import org.apache.mahout.test.MahoutSuite -import org.apache.mahout.common.RandomUtils -import org.apache.mahout.math._ -import scalabindings._ -import RLikeOps._ - -/** - * This suite tests only in-core decomposititions. - * <P> - * - * We moved distributed tests into mahout-spark module since they require a concrete distributed - * engine dependencies to run. 
- * <P> - */ -class DecompositionsSuite extends FunSuite with MahoutSuite { - - test("ssvd") { - - // Very naive, a full-rank only here. - val a = dense( - (1, 2, 3), - (3, 4, 5), - (-2, 6, 7), - (-3, 8, 9) - ) - - val rank = 2 - val (u, v, s) = ssvd(a, k = rank, q = 1) - - val (uControl, vControl, sControl) = svd(a) - - printf("U:\n%s\n", u) - printf("U-control:\n%s\n", uControl) - printf("V:\n%s\n", v) - printf("V-control:\n%s\n", vControl) - printf("Sigma:\n%s\n", s) - printf("Sigma-control:\n%s\n", sControl) - - (s - sControl(0 until rank)).norm(2) should be < 1E-7 - - // Singular vectors may be equivalent down to a sign only. - (u.norm - uControl(::, 0 until rank).norm).abs should be < 1E-7 - (v.norm - vControl(::, 0 until rank).norm).abs should be < 1E-7 - } - - test("spca") { - - import math._ - - val rnd = RandomUtils.getRandom - - // Number of points - val m = 500 - // Length of actual spectrum - val spectrumLen = 40 - - val spectrum = dvec((0 until spectrumLen).map(x => 300.0 * exp(-x) max 1e-3)) - printf("spectrum:%s\n", spectrum) - - val (u, _) = qr(new SparseRowMatrix(m, spectrumLen) := - ((r, c, v) => if (rnd.nextDouble() < 0.2) 0 else rnd.nextDouble() + 5.0)) - - // PCA Rotation matrix -- should also be orthonormal. - val (tr, _) = qr(Matrices.symmetricUniformView(spectrumLen, spectrumLen, rnd.nextInt) - 10.0) - - val input = (u %*%: diagv(spectrum)) %*% tr.t - - // Calculate just first 10 principal factors and reduce dimensionality. - // Since we assert just validity of the s-pca, not stochastic error, we bump p parameter to - // ensure to zero stochastic error and assert only functional correctness of the method's pca- - // specific additions. - val k = 10 - var (pca, _, s) = spca(a = input, k = k, p = spectrumLen, q = 1) - printf("Svs:%s\n", s) - // Un-normalized pca data: - pca = pca %*%: diagv(s) - - // Of course, once we calculated the pca, the spectrum is going to be different since our originally - // generated input was not centered. 
So here, we'd just brute-solve pca to verify - val xi = input.colMeans() - for (r <- 0 until input.nrow) input(r, ::) -= xi - var (pcaControl, _, sControl) = svd(m = input) - - printf("Svs-control:%s\n", sControl) - pcaControl = (pcaControl %*%: diagv(sControl))(::, 0 until k) - - printf("pca:\n%s\n", pca(0 until 10, 0 until 10)) - printf("pcaControl:\n%s\n", pcaControl(0 until 10, 0 until 10)) - - (pca(0 until 10, 0 until 10).norm - pcaControl(0 until 10, 0 until 10).norm).abs should be < 1E-5 - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/decompositions/DistributedDecompositionsSuiteBase.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/test/scala/org/apache/mahout/math/decompositions/DistributedDecompositionsSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/decompositions/DistributedDecompositionsSuiteBase.scala deleted file mode 100644 index de8228e..0000000 --- a/math-scala/src/test/scala/org/apache/mahout/math/decompositions/DistributedDecompositionsSuiteBase.scala +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.math.decompositions - -import org.apache.mahout.test.DistributedMahoutSuite -import org.apache.mahout.math._ -import scalabindings._ -import RLikeOps._ -import drm._ -import RLikeDrmOps._ -import org.scalatest.{FunSuite, Matchers} -import org.apache.mahout.common.RandomUtils -import math._ - -/** - * ==Common distributed code to run against each distributed engine support.== - * - * Each distributed engine's decompositions package should have a suite that includes this feature - * as part of its distributed test suite. - * - */ -trait DistributedDecompositionsSuiteBase extends DistributedMahoutSuite with Matchers { this:FunSuite => - - - test("thin distributed qr") { - - val inCoreA = dense( - (1, 2, 3, 4), - (2, 3, 4, 5), - (3, -4, 5, 6), - (4, 5, 6, 7), - (8, 6, 7, 8) - ) - - val drmA = drmParallelize(inCoreA, numPartitions = 2) - val (drmQ, inCoreR) = dqrThin(drmA, checkRankDeficiency = false) - - // Assert optimizer still knows Q and A are identically partitioned - drmQ.partitioningTag should equal(drmA.partitioningTag) - -// drmQ.rdd.partitions.size should be(A.rdd.partitions.size) -// -// // Should also be zippable -// drmQ.rdd.zip(other = A.rdd) - - val inCoreQ = drmQ.collect - - printf("A=\n%s\n", inCoreA) - printf("Q=\n%s\n", inCoreQ) - printf("R=\n%s\n", inCoreR) - - val (qControl, rControl) = qr(inCoreA) - printf("qControl=\n%s\n", qControl) - printf("rControl=\n%s\n", rControl) - - // Validate with Cholesky - val ch = chol(inCoreA.t %*% inCoreA) - printf("A'A=\n%s\n", inCoreA.t %*% inCoreA) - printf("L:\n%s\n", ch.getL) - - val rControl2 = (ch.getL cloned).t - val qControl2 = ch.solveRight(inCoreA) - printf("qControl2=\n%s\n", qControl2) - printf("rControl2=\n%s\n", rControl2) - - // Householder approach seems to be a little bit more stable - (rControl - inCoreR).norm should be < 1E-5 - (qControl - inCoreQ).norm should be < 1E-5 - - // Assert identicity with in-core Cholesky-based -- this should be tighter. 
- (rControl2 - inCoreR).norm should be < 1E-10 - (qControl2 - inCoreQ).norm should be < 1E-10 - - // Assert orthogonality: - // (a) Q[,j] dot Q[,j] == 1.0 for all j - // (b) Q[,i] dot Q[,j] == 0.0 for all i != j - for (col <- 0 until inCoreQ.ncol) - ((inCoreQ(::, col) dot inCoreQ(::, col)) - 1.0).abs should be < 1e-10 - for (col1 <- 0 until inCoreQ.ncol - 1; col2 <- col1 + 1 until inCoreQ.ncol) - (inCoreQ(::, col1) dot inCoreQ(::, col2)).abs should be < 1e-10 - - - } - - test("dssvd - the naive-est - q=0") { - dssvdNaive(q = 0) - } - - test("ddsvd - naive - q=1") { - dssvdNaive(q = 1) - } - - test("ddsvd - naive - q=2") { - dssvdNaive(q = 2) - } - - - def dssvdNaive(q: Int) { - val inCoreA = dense( - (1, 2, 3, 4), - (2, 3, 4, 5), - (3, -4, 5, 6), - (4, 5, 6, 7), - (8, 6, 7, 8) - ) - val drmA = drmParallelize(inCoreA, numPartitions = 2) - - val (drmU, drmV, s) = dssvd(drmA, k = 4, q = q) - val (inCoreU, inCoreV) = (drmU.collect, drmV.collect) - - printf("U:\n%s\n", inCoreU) - printf("V:\n%s\n", inCoreV) - printf("Sigma:\n%s\n", s) - - (inCoreA - (inCoreU %*%: diagv(s)) %*% inCoreV.t).norm should be < 1E-5 - } - - test("dspca") { - - val rnd = RandomUtils.getRandom - - // Number of points - val m = 500 - // Length of actual spectrum - val spectrumLen = 40 - - val spectrum = dvec((0 until spectrumLen).map(x => 300.0 * exp(-x) max 1e-3)) - printf("spectrum:%s\n", spectrum) - - val (u, _) = qr(new SparseRowMatrix(m, spectrumLen) := - ((r, c, v) => if (rnd.nextDouble() < 0.2) 0 else rnd.nextDouble() + 5.0)) - - // PCA Rotation matrix -- should also be orthonormal. - val (tr, _) = qr(Matrices.symmetricUniformView(spectrumLen, spectrumLen, rnd.nextInt) - 10.0) - - val input = (u %*%: diagv(spectrum)) %*% tr.t - val drmInput = drmParallelize(m = input, numPartitions = 2) - - // Calculate just first 10 principal factors and reduce dimensionality. 
- // Since we assert just validity of the s-pca, not stochastic error, we bump p parameter to - // ensure to zero stochastic error and assert only functional correctness of the method's pca- - // specific additions. - val k = 10 - - // Calculate just first 10 principal factors and reduce dimensionality. - var (drmPCA, _, s) = dspca(drmA = drmInput, k = 10, p = spectrumLen, q = 1) - // Un-normalized pca data: - drmPCA = drmPCA %*% diagv(s) - - val pca = drmPCA.checkpoint(CacheHint.NONE).collect - - // Of course, once we calculated the pca, the spectrum is going to be different since our originally - // generated input was not centered. So here, we'd just brute-solve pca to verify - val xi = input.colMeans() - for (r <- 0 until input.nrow) input(r, ::) -= xi - var (pcaControl, _, sControl) = svd(m = input) - pcaControl = (pcaControl %*%: diagv(sControl))(::, 0 until k) - - printf("pca:\n%s\n", pca(0 until 10, 0 until 10)) - printf("pcaControl:\n%s\n", pcaControl(0 until 10, 0 until 10)) - - (pca(0 until 10, 0 until 10).norm - pcaControl(0 until 10, 0 until 10).norm).abs should be < 1E-5 - - } - - test("dals") { - - val rnd = RandomUtils.getRandom - - // Number of points - val m = 500 - val n = 500 - - // Length of actual spectrum - val spectrumLen = 40 - - // Create singluar values with decay - val spectrum = dvec((0 until spectrumLen).map(x => 300.0 * exp(-x) max 1e-3)) - printf("spectrum:%s\n", spectrum) - - // Create A as an ideal input - val inCoreA = (qr(Matrices.symmetricUniformView(m, spectrumLen, 1234))._1 %*%: diagv(spectrum)) %*% - qr(Matrices.symmetricUniformView(n, spectrumLen, 2345))._1.t - val drmA = drmParallelize(inCoreA, numPartitions = 2) - - // Decompose using ALS - val (drmU, drmV, rmse) = dals(drmA = drmA, k = 20).toTuple - val inCoreU = drmU.collect - val inCoreV = drmV.collect - - val predict = inCoreU %*% inCoreV.t - - printf("Control block:\n%s\n", inCoreA(0 until 3, 0 until 3)) - printf("ALS factorized approximation block:\n%s\n", predict(0 
until 3, 0 until 3)) - - val err = (inCoreA - predict).norm - printf("norm of residuals %f\n", err) - printf("train iteration rmses: %s\n", rmse) - - err should be < 15e-2 - - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeOpsSuiteBase.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeOpsSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeOpsSuiteBase.scala deleted file mode 100644 index 525da11..0000000 --- a/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeOpsSuiteBase.scala +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.math.drm - -import org.apache.mahout.test.DistributedMahoutSuite -import org.scalatest.{FunSuite, Matchers} -import org.apache.mahout.math._ -import scalabindings._ -import RLikeOps._ -import RLikeDrmOps._ - -import scala.reflect.{ClassTag,classTag} - -/** Common tests for DrmLike operators to be executed by all distributed engines. 
*/ -trait DrmLikeOpsSuiteBase extends DistributedMahoutSuite with Matchers { - this: FunSuite ⇒ - - test("mapBlock") { - - val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)) - val A = drmParallelize(m = inCoreA, numPartitions = 2) - val B = A.mapBlock(/* Inherit width */) { - case (keys, block) ⇒ keys → (block += 1.0) - } - - val inCoreB = B.collect - val inCoreBControl = inCoreA + 1.0 - - println(inCoreB) - - // Assert they are the same - (inCoreB - inCoreBControl).norm should be < 1E-10 - B.keyClassTag shouldBe ClassTag.Int - - } - - test ("mapBlock implicit keying") { - - val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)) - val A = drmParallelize(m = inCoreA, numPartitions = 2) - val B = A.mapBlock(/* Inherit width */) { - case (keys, block) ⇒ keys.map { k ⇒ k.toString } → block - } - - B.keyClassTag shouldBe classTag[String] - - } - - - test("allReduceBlock") { - - val mxA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)) - val drmA = drmParallelize(mxA, numPartitions = 2) - - try { - val mxB = drmA.allreduceBlock { case (keys, block) ⇒ - block(::, 0 until 2).t %*% block(::, 2 until 3) - } - - val mxControl = mxA(::, 0 until 2).t %*% mxA(::, 2 until 3) - - (mxB - mxControl).norm should be < 1e-10 - - } catch { - case e: UnsupportedOperationException ⇒ // Some engines may not support this, so ignore. 
- } - - } - - test("col range") { - val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)) - val A = drmParallelize(m = inCoreA, numPartitions = 2) - val B = A(::, 1 to 2) - val inCoreB = B.collect - val inCoreBControl = inCoreA(::, 1 to 2) - - println(inCoreB) - - // Assert they are the same - (inCoreB - inCoreBControl).norm should be < 1E-10 - - } - - test("row range") { - - val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)) - val A = drmParallelize(m = inCoreA, numPartitions = 2) - val B = A(1 to 2, ::) - val inCoreB = B.collect - val inCoreBControl = inCoreA(1 to 2, ::) - - println(inCoreB) - - // Assert they are the same - (inCoreB - inCoreBControl).norm should be < 1E-10 - - } - - test("col, row range") { - - val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)) - val A = drmParallelize(m = inCoreA, numPartitions = 2) - val B = A(1 to 2, 1 to 2) - val inCoreB = B.collect - val inCoreBControl = inCoreA(1 to 2, 1 to 2) - - println(inCoreB) - - // Assert they are the same - (inCoreB - inCoreBControl).norm should be < 1E-10 - - } - - test("dsqDist(X,Y)") { - val m = 100 - val n = 300 - val d = 7 - val mxX = Matrices.symmetricUniformView(m, d, 12345).cloned -= 5 - val mxY = Matrices.symmetricUniformView(n, d, 1234).cloned += 10 - val (drmX, drmY) = (drmParallelize(mxX, 3), drmParallelize(mxY, 4)) - - val mxDsq = dsqDist(drmX, drmY).collect - val mxDsqControl = new DenseMatrix(m, n) := { (r, c, _) ⇒ (mxX(r, ::) - mxY(c, ::)) ^= 2 sum } - (mxDsq - mxDsqControl).norm should be < 1e-7 - } - - test("dsqDist(X)") { - val m = 100 - val d = 7 - val mxX = Matrices.symmetricUniformView(m, d, 12345).cloned -= 5 - val drmX = drmParallelize(mxX, 3) - - val mxDsq = dsqDist(drmX).collect - val mxDsqControl = sqDist(drmX) - (mxDsq - mxDsqControl).norm should be < 1e-7 - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeSuiteBase.scala 
---------------------------------------------------------------------- diff --git a/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeSuiteBase.scala deleted file mode 100644 index 41814d8..0000000 --- a/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeSuiteBase.scala +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.math.drm - -import org.apache.mahout.test.DistributedMahoutSuite -import org.scalatest.{FunSuite, Matchers} -import org.apache.mahout.math._ -import scalabindings._ -import RLikeOps._ -import scala.reflect.ClassTag - -/** Common DRM tests to be run by all distributed engines. 
*/ -trait DrmLikeSuiteBase extends DistributedMahoutSuite with Matchers { - this: FunSuite => - - test("DRM DFS i/o (local)") { - - val uploadPath = TmpDir + "UploadedDRM" - - val inCoreA = dense((1, 2, 3), (3, 4, 5)) - val drmA = drmParallelize(inCoreA) - - drmA.dfsWrite(path = uploadPath) - - println(inCoreA) - - // Load back from hdfs - val drmB = drmDfsRead(path = uploadPath) - - // Make sure keys are correctly identified as ints - drmB.checkpoint(CacheHint.NONE).keyClassTag shouldBe ClassTag.Int - - // Collect back into in-core - val inCoreB = drmB.collect - - // Print out to see what it is we collected: - println(inCoreB) - - (inCoreA - inCoreB).norm should be < 1e-7 - } - - test("DRM parallelizeEmpty") { - - val drmEmpty = drmParallelizeEmpty(100, 50) - - // collect back into in-core - val inCoreEmpty = drmEmpty.collect - - inCoreEmpty.sum.abs should be < 1e-7 - drmEmpty.nrow shouldBe 100 - drmEmpty.ncol shouldBe 50 - inCoreEmpty.nrow shouldBe 100 - inCoreEmpty.ncol shouldBe 50 - - } - - - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/drm/RLikeDrmOpsSuiteBase.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/test/scala/org/apache/mahout/math/drm/RLikeDrmOpsSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/drm/RLikeDrmOpsSuiteBase.scala deleted file mode 100644 index 5d6d142..0000000 --- a/math-scala/src/test/scala/org/apache/mahout/math/drm/RLikeDrmOpsSuiteBase.scala +++ /dev/null @@ -1,655 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.math.drm - -import org.apache.mahout.test.DistributedMahoutSuite -import org.scalatest.{FunSuite, Matchers} -import org.apache.mahout.math._ -import scalabindings._ -import RLikeOps._ -import RLikeDrmOps._ -import decompositions._ -import org.apache.mahout.math.drm.logical._ -import org.apache.mahout.math.drm.logical.OpAtx -import org.apache.mahout.math.drm.logical.OpAtB -import org.apache.mahout.math.drm.logical.OpAtA -import org.apache.mahout.math.drm.logical.OpAewUnaryFuncFusion - -import scala.util.Random - -/** Common engine tests for distributed R-like DRM operations */ -trait RLikeDrmOpsSuiteBase extends DistributedMahoutSuite with Matchers { - this: FunSuite => - - val epsilon = 1E-5 - - test("A.t") { - - val inCoreA = dense((1, 2, 3), (3, 4, 5)) - - val A = drmParallelize(inCoreA) - - val inCoreAt = A.t.collect - - // Assert first norm of difference is less than error margin. 
- (inCoreAt - inCoreA.t).norm should be < epsilon - - } - - test("C = A %*% B") { - - val inCoreA = dense((1, 2), (3, 4)) - val inCoreB = dense((3, 5), (4, 6)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - val B = drmParallelize(inCoreB, numPartitions = 2) - - // Actual - val inCoreCControl = inCoreA %*% inCoreB - - // Distributed operation - val C = A %*% B - val inCoreC = C.collect - println(inCoreC) - - (inCoreC - inCoreCControl).norm should be < 1E-10 - - // We also should be able to collect via implicit checkpoint - val inCoreC2 = C.collect - println(inCoreC2) - - (inCoreC2 - inCoreCControl).norm should be < 1E-10 - - } - - test("C = A %*% B mapBlock {}") { - - val inCoreA = dense((1, 2), (3, 4)) - val inCoreB = dense((3, 5), (4, 6)) - - val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint() - val B = drmParallelize(inCoreB, numPartitions = 2).checkpoint() - - // Actual - val inCoreCControl = inCoreA %*% inCoreB - - A.colSums() - B.colSums() - - - val x = drmBroadcast(dvec(0, 0)) - val x2 = drmBroadcast(dvec(0, 0)) - // Distributed operation - val C = (B.t %*% A.t).t.mapBlock() { - case (keys, block) => - for (row <- 0 until block.nrow) block(row, ::) += x.value + x2 - keys -> block - } - - val inCoreC = C checkpoint CacheHint.NONE collect; - println(inCoreC) - - (inCoreC - inCoreCControl).norm should be < 1E-10 - - // We also should be able to collect via implicit checkpoint - val inCoreC2 = C.collect - println(inCoreC2) - - (inCoreC2 - inCoreCControl).norm should be < 1E-10 - - val inCoreQ = dqrThin(C)._1.collect - - printf("Q=\n%s\n", inCoreQ) - - // Assert unit-orthogonality - ((inCoreQ(::, 0) dot inCoreQ(::, 0)) - 1.0).abs should be < 1e-10 - (inCoreQ(::, 0) dot inCoreQ(::, 1)).abs should be < 1e-10 - - } - - test("C = A %*% B incompatible B keys") { - - val inCoreA = dense((1, 2), (3, 4)) - val inCoreB = dense((3, 5), (4, 6)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - val B = drmParallelize(inCoreB, numPartitions = 2) - 
// Re-key B into DrmLike[String] instead of [Int] - .mapBlock()({ - case (keys, block) => keys.map(_.toString) -> block - }) - - val C = A %*% B - - intercept[IllegalArgumentException] { - // This plan must not compile - C.checkpoint() - } - } - - test("Spark-specific C = At %*% B , join") { - - val inCoreA = dense((1, 2), (3, 4), (-3, -5)) - val inCoreB = dense((3, 5), (4, 6), (0, 1)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - val B = drmParallelize(inCoreB, numPartitions = 2) - - val C = A.t %*% B - - mahoutCtx.optimizerRewrite(C) should equal(OpAtB[Int](A, B)) - - val inCoreC = C.collect - val inCoreControlC = inCoreA.t %*% inCoreB - - (inCoreC - inCoreControlC).norm should be < 1E-10 - - } - - - test("C = At %*% B , join, String-keyed") { - - val inCoreA = dense((1, 2), (3, 4), (-3, -5)) - val inCoreB = dense((3, 5), (4, 6), (0, 1)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - .mapBlock()({ - case (keys, block) => keys.map(_.toString) -> block - }) - - val B = drmParallelize(inCoreB, numPartitions = 2) - .mapBlock()({ - case (keys, block) => keys.map(_.toString) -> block - }) - - val C = A.t %*% B - - mahoutCtx.optimizerRewrite(C) should equal(OpAtB[String](A, B)) - - val inCoreC = C.collect - val inCoreControlC = inCoreA.t %*% inCoreB - - (inCoreC - inCoreControlC).norm should be < 1E-10 - - } - - test("C = At %*% B , zippable, String-keyed") { - - val inCoreA = dense((1, 2), (3, 4), (-3, -5)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - .mapBlock()({ - case (keys, block) => keys.map(_.toString) -> block - }) - - // Dense-A' x sparse-B used to produce error. We sparsify B here to test this as well. 
- val B = (A + 1.0).mapBlock() { case (keys, block) => - keys -> (new SparseRowMatrix(block.nrow, block.ncol) := block) - } - - val C = A.t %*% B - - mahoutCtx.optimizerRewrite(C) should equal(OpAtB[String](A, B)) - - val inCoreC = C.collect - val inCoreControlC = inCoreA.t %*% (inCoreA + 1.0) - - (inCoreC - inCoreControlC).norm should be < 1E-10 - - } - - test ("C = A %*% B.t") { - - val inCoreA = dense((1, 2), (3, 4), (-3, -5)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - - val B = A + 1.0 - - val C = A %*% B.t - - mahoutCtx.optimizerRewrite(C) should equal(OpABt[Int](A, B)) - - val inCoreC = C.collect - val inCoreControlC = inCoreA %*% (inCoreA + 1.0).t - - (inCoreC - inCoreControlC).norm should be < 1E-10 - - } - - test("C = A %*% inCoreB") { - - val inCoreA = dense((1, 2, 3), (3, 4, 5), (4, 5, 6), (5, 6, 7)) - val inCoreB = dense((3, 5, 7, 10), (4, 6, 9, 10), (5, 6, 7, 7)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - val C = A %*% inCoreB - - val inCoreC = C.collect - val inCoreCControl = inCoreA %*% inCoreB - - println(inCoreC) - (inCoreC - inCoreCControl).norm should be < 1E-10 - - } - - test("C = inCoreA %*%: B") { - - val inCoreA = dense((1, 2, 3), (3, 4, 5), (4, 5, 6), (5, 6, 7)) - val inCoreB = dense((3, 5, 7, 10), (4, 6, 9, 10), (5, 6, 7, 7)) - - val B = drmParallelize(inCoreB, numPartitions = 2) - val C = inCoreA %*%: B - - val inCoreC = C.collect - val inCoreCControl = inCoreA %*% inCoreB - - println(inCoreC) - (inCoreC - inCoreCControl).norm should be < 1E-10 - - } - - test("C = A.t %*% A") { - val inCoreA = dense((1, 2, 3), (3, 4, 5), (4, 5, 6), (5, 6, 7)) - val A = drmParallelize(m = inCoreA, numPartitions = 2) - - val AtA = A.t %*% A - - // Assert optimizer detects square - mahoutCtx.optimizerRewrite(action = AtA) should equal(OpAtA(A)) - - val inCoreAtA = AtA.collect - val inCoreAtAControl = inCoreA.t %*% inCoreA - - (inCoreAtA - inCoreAtAControl).norm should be < 1E-10 - } - - test("C = A.t %*% A fat non-graph") { - // Hack 
the max in-mem size for this test - System.setProperty("mahout.math.AtA.maxInMemNCol", "540") - - val inCoreA = Matrices.uniformView(400, 550, 1234) - val A = drmParallelize(m = inCoreA, numPartitions = 2) - - val AtA = A.t %*% A - - // Assert optimizer detects square - mahoutCtx.optimizerRewrite(action = AtA) should equal(OpAtA(A)) - - val inCoreAtA = AtA.collect - val inCoreAtAControl = inCoreA.t %*% inCoreA - - (inCoreAtA - inCoreAtAControl).norm should be < 1E-10 - } - - test("C = A.t %*% A non-int key") { - val inCoreA = dense((1, 2, 3), (3, 4, 5), (4, 5, 6), (5, 6, 7)) - val AintKeyd = drmParallelize(m = inCoreA, numPartitions = 2) - val A = AintKeyd.mapBlock() { - case (keys, block) => keys.map(_.toString) -> block - } - - val AtA = A.t %*% A - - // Assert optimizer detects square - mahoutCtx.optimizerRewrite(action = AtA) should equal(OpAtA(A)) - - val inCoreAtA = AtA.collect - val inCoreAtAControl = inCoreA.t %*% inCoreA - - (inCoreAtA - inCoreAtAControl).norm should be < 1E-10 - } - - test("C = A + B") { - - val inCoreA = dense((1, 2), (3, 4)) - val inCoreB = dense((3, 5), (4, 6)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - val B = drmParallelize(inCoreB, numPartitions = 2) - - val C = A + B - val inCoreC = C.collect - - // Actual - val inCoreCControl = inCoreA + inCoreB - - (inCoreC - inCoreCControl).norm should be < 1E-10 - } - - test("C = A + B, identically partitioned") { - - val inCoreA = dense((1, 2, 3), (3, 4, 5), (5, 6, 7)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - -// printf("A.nrow=%d.\n", A.rdd.count()) - - // Create B which would be identically partitioned to A. mapBlock() by default will do the trick. - val B = A.mapBlock() { - case (keys, block) => - val bBlock = block.like() := { (r, c, v) => util.Random.nextDouble()} - keys -> bBlock - } - // Prevent repeated computation non-determinism - // removing this checkpoint() will cause the same error in spark Tests - // as we're seeing in Flink with this test. 
ie util.Random.nextDouble() - // is being called more than once (note that it is not seeded in the closure) - .checkpoint() - - val inCoreB = B.collect - - printf("A=\n%s\n", inCoreA) - printf("B=\n%s\n", inCoreB) - - val C = A + B - - val inCoreC = C.collect - - printf("C=\n%s\n", inCoreC) - - // Actual - val inCoreCControl = inCoreA + inCoreB - - (inCoreC - inCoreCControl).norm should be < 1E-10 - } - - - test("C = A + B side test 1") { - - val inCoreA = dense((1, 2), (3, 4)) - val inCoreB = dense((3, 5), (4, 6)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - val B = drmParallelize(inCoreB, numPartitions = 2) - - val C = A + B - val inCoreC = C.collect - - val inCoreD = (A + B).collect - - // Actual - val inCoreCControl = inCoreA + inCoreB - - (inCoreC - inCoreCControl).norm should be < 1E-10 - (inCoreD - inCoreCControl).norm should be < 1E-10 - } - - test("C = A + B side test 2") { - - val inCoreA = dense((1, 2), (3, 4)) - val inCoreB = dense((3, 5), (4, 6)) - - val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint() - val B = drmParallelize(inCoreB, numPartitions = 2) - - val C = A + B - val inCoreC = C.collect - - val inCoreD = (A + B).collect - - // Actual - val inCoreCControl = inCoreA + inCoreB - - (inCoreC - inCoreCControl).norm should be < 1E-10 - (inCoreD - inCoreCControl).norm should be < 1E-10 - } - - test("C = A + B side test 3") { - - val inCoreA = dense((1, 2), (3, 4)) - val inCoreB = dense((3, 5), (4, 6)) - - val B = drmParallelize(inCoreB, numPartitions = 2) - // val A = (drmParallelize(inCoreA, numPartitions = 2) + B).checkpoint(CacheHint.MEMORY_ONLY_SER) - val A = (drmParallelize(inCoreA, numPartitions = 2) + B).checkpoint(CacheHint.MEMORY_ONLY) - - val C = A + B - val inCoreC = C.collect - - val inCoreD = (A + B).collect - - // Actual - val inCoreCControl = inCoreA + inCoreB * 2.0 - - (inCoreC - inCoreCControl).norm should be < 1E-10 - (inCoreD - inCoreCControl).norm should be < 1E-10 - } - - test("Ax") { - val inCoreA = 
dense( - (1, 2), - (3, 4), - (20, 30) - ) - val x = dvec(10, 3) - - val drmA = drmParallelize(inCoreA, numPartitions = 2) - - val ax = (drmA %*% x).collect(::, 0) - - ax should equal(inCoreA %*% x) - } - - test("A'x") { - val inCoreA = dense( - (1, 2), - (3, 4), - (20, 30) - ) - val x = dvec(10, 3, 4) - - val drmA = drmParallelize(inCoreA, numPartitions = 2) - - mahoutCtx.optimizerRewrite(drmA.t %*% x) should equal(OpAtx(drmA, x)) - - val atx = (drmA.t %*% x).collect(::, 0) - - atx should equal(inCoreA.t %*% x) - } - - test("colSums, colMeans") { - val inCoreA = dense( - (1, 2), - (3, 4), - (20, 30) - ) - val drmA = drmParallelize(inCoreA, numPartitions = 2) - - drmA.colSums() should equal(inCoreA.colSums()) - drmA.colMeans() should equal(inCoreA.colMeans()) - } - - test("rowSums, rowMeans") { - val inCoreA = dense( - (1, 2), - (3, 4), - (20, 30) - ) - val drmA = drmParallelize(inCoreA, numPartitions = 2) - - drmA.rowSums() should equal(inCoreA.rowSums()) - drmA.rowMeans() should equal(inCoreA.rowMeans()) - } - - test("A.diagv") { - val inCoreA = dense( - (1, 2, 3), - (3, 4, 5), - (20, 30, 7) - ) - val drmA = drmParallelize(inCoreA, numPartitions = 2) - - drmA.diagv should equal(inCoreA.diagv) - } - - test("numNonZeroElementsPerColumn") { - val inCoreA = dense( - (0, 2), - (3, 0), - (0, -30) - - ) - val drmA = drmParallelize(inCoreA, numPartitions = 2) - - drmA.numNonZeroElementsPerColumn() should equal(inCoreA.numNonZeroElementsPerColumn()) - } - - test("C = A cbind B, cogroup") { - - val inCoreA = dense((1, 2), (3, 4)) - val inCoreB = dense((3, 5), (4, 6)) - val controlC = dense((1, 2, 3, 5), (3, 4, 4, 6)) - - val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint() - val B = drmParallelize(inCoreB, numPartitions = 2).checkpoint() - - (A.cbind(B) -: controlC).norm should be < 1e-10 - - } - - test("C = A cbind B, zip") { - - val inCoreA = dense((1, 2), (3, 4)) - val controlC = dense((1, 2, 2, 3), (3, 4, 4, 5)) - - val A = drmParallelize(inCoreA, 
numPartitions = 2).checkpoint() - - (A.cbind(A + 1.0) -: controlC).norm should be < 1e-10 - - } - - test("B = 1 cbind A") { - val inCoreA = dense((1, 2), (3, 4)) - val control = dense((1, 1, 2), (1, 3, 4)) - - val drmA = drmParallelize(inCoreA, numPartitions = 2) - - (control - (1 cbind drmA) ).norm should be < 1e-10 - } - - test("B = A cbind 1") { - val inCoreA = dense((1, 2), (3, 4)) - val control = dense((1, 2, 1), (3, 4, 1)) - - val drmA = drmParallelize(inCoreA, numPartitions = 2) - - (control - (drmA cbind 1) ).norm should be < 1e-10 - } - - test("B = A + 1.0") { - val inCoreA = dense((1, 2), (2, 3), (3, 4)) - val controlB = inCoreA + 1.0 - - val drmB = drmParallelize(m = inCoreA, numPartitions = 2) + 1.0 - - (drmB -: controlB).norm should be < 1e-10 - } - - test("C = A rbind B") { - - val inCoreA = dense((1, 2), (3, 5)) - val inCoreB = dense((7, 11), (13, 17)) - val controlC = dense((1, 2), (3, 5), (7, 11), (13, 17)) - - val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint() - val B = drmParallelize(inCoreB, numPartitions = 2).checkpoint() - - (A.rbind(B) -: controlC).norm should be < 1e-10 - } - - test("C = A rbind B, with empty") { - - val inCoreA = dense((1, 2), (3, 5)) - val emptyB = drmParallelizeEmpty(nrow = 2, ncol = 2, numPartitions = 2) - val controlC = dense((1, 2), (3, 5), (0, 0), (0, 0)) - - val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint() - - (A.rbind(emptyB) -: controlC).norm should be < 1e-10 - } - - /** Test dsl overloads over scala operations over matrices */ - test("scalarOps") { - val drmA = drmParallelize(m = dense( - (1, 2, 3), - (3, 4, 5), - (7, 8, 9) - ), - numPartitions = 2) - - (10 * drmA - (10 *: drmA)).norm shouldBe 0 - - } - - test("A * A -> sqr(A) rewrite ") { - val mxA = dense( - (1, 2, 3), - (3, 4, 5), - (7, 8, 9) - ) - - val mxAAControl = mxA * mxA - - val drmA = drmParallelize(mxA, 2) - val drmAA = drmA * drmA - - val optimized = drmAA.context.engine.optimizerRewrite(drmAA) - 
println(s"optimized:$optimized") - optimized.isInstanceOf[OpAewUnaryFunc[Int]] shouldBe true - - (mxAAControl -= drmAA).norm should be < 1e-10 - } - - test("B = 1 + 2 * (A * A) ew unary function fusion") { - val mxA = dense( - (1, 2, 3), - (3, 0, 5) - ) - val controlB = mxA.cloned := { (x) => 1 + 2 * x * x} - - val drmA = drmParallelize(mxA, 2) - - // We need to use parenthesis, otherwise optimizer will see it as (2A) * (A) and that would not - // be rewritten as 2 * sqr(A). It is not that clever (yet) to try commutativity optimizations. - val drmB = 1 + 2 * (drmA * drmA) - - val optimized = mahoutCtx.engine.optimizerRewrite(drmB) - println(s"optimizer rewritten:$optimized") - optimized.isInstanceOf[OpAewUnaryFuncFusion[Int]] shouldBe true - - (controlB - drmB).norm should be < 1e-10 - - } - - test("functional apply()") { - val mxA = sparse ( - (1 -> 3) :: (7 -> 7) :: Nil, - (4 -> 5) :: (5 -> 8) :: Nil - ) - - val mxAControl = mxA cloned - val drmA = drmParallelize(mxA) - - (drmA(x => x + 1).collect - (mxAControl + 1)).norm should be < 1e-7 - (drmA(x => x * 2).collect - (2 * mxAControl)).norm should be < 1e-7 - - } - - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MahoutCollectionsSuite.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MahoutCollectionsSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MahoutCollectionsSuite.scala deleted file mode 100644 index cf62eea..0000000 --- a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MahoutCollectionsSuite.scala +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.math.scalabindings - -import org.apache.mahout.math.Vector -import org.apache.mahout.test.MahoutSuite -import org.scalatest.FunSuite -import org.apache.mahout.math.scalabindings.MahoutCollections._ -import org.apache.mahout.math._ -import org.apache.mahout.math.scalabindings.RLikeOps._ - -class MahoutCollectionsSuite extends FunSuite with MahoutSuite { - test("toArray") { - val a = Array(1.0, 2.0, 3.0) - val v: Vector = new org.apache.mahout.math.DenseVector(a) - - v.toArray.deep shouldBe a.deep - - } - - test("toMap") { - val m = Map( (1 -> 1.0), (3 -> 3.0)) - val sv = svec(m) - - sv.toMap shouldBe m - } -}
