mahout#47

pat Thu, 28 Aug 2014 11:00:28 -0700

Repository: mahout
Updated Branches:
  refs/heads/master 91f15ecfe -> 149c98592



http://git-wip-us.apache.org/repos/asf/mahout/blob/149c9859/spark/src/test/scala/org/apache/mahout/cf/CooccurrenceAnalysisSuite.scala
----------------------------------------------------------------------
diff --git 
a/spark/src/test/scala/org/apache/mahout/cf/CooccurrenceAnalysisSuite.scala 
b/spark/src/test/scala/org/apache/mahout/cf/CooccurrenceAnalysisSuite.scala
index 642e90a..29c7b84 100644
--- a/spark/src/test/scala/org/apache/mahout/cf/CooccurrenceAnalysisSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/cf/CooccurrenceAnalysisSuite.scala
@@ -17,7 +17,7 @@
 
 package org.apache.mahout.cf
 
-import org.apache.mahout.math.cf.CooccurrenceAnalysis
+import org.apache.mahout.math.cf.SimilarityAnalysis
 import org.apache.mahout.math.drm._
 import org.apache.mahout.math.scalabindings.{MatrixOps, _}
 import org.apache.mahout.sparkbindings.test.DistributedSparkSuite
@@ -81,7 +81,7 @@ class CooccurrenceAnalysisSuite extends FunSuite with 
MahoutSuite with Distribut
     val drmB = drmParallelize(m = b, numPartitions = 2)
 
     //self similarity
-    val drmCooc = CooccurrenceAnalysis.cooccurrences(drmARaw = drmA, 
randomSeed = 1, drmBs = Array(drmB))
+    val drmCooc = SimilarityAnalysis.cooccurrences(drmARaw = drmA, randomSeed 
= 1, drmBs = Array(drmB))
     val matrixSelfCooc = drmCooc(0).checkpoint().collect
     val diffMatrix = matrixSelfCooc.minus(matrixLLRCoocAtAControl)
     var n = (new MatrixOps(m = diffMatrix)).norm
@@ -112,7 +112,7 @@ class CooccurrenceAnalysisSuite extends FunSuite with 
MahoutSuite with Distribut
     val drmB = drmParallelize(m = b, numPartitions = 2)
 
     //self similarity
-    val drmCooc = CooccurrenceAnalysis.cooccurrences(drmARaw = drmA, drmBs = 
Array(drmB))
+    val drmCooc = SimilarityAnalysis.cooccurrences(drmARaw = drmA, drmBs = 
Array(drmB))
     val matrixSelfCooc = drmCooc(0).checkpoint().collect
     val diffMatrix = matrixSelfCooc.minus(matrixLLRCoocAtAControl)
     var n = (new MatrixOps(m = diffMatrix)).norm
@@ -142,7 +142,7 @@ class CooccurrenceAnalysisSuite extends FunSuite with 
MahoutSuite with Distribut
     val drmB = drmParallelize(m = b, numPartitions = 2)
 
    //self similarity
-    val drmCooc = CooccurrenceAnalysis.cooccurrences(drmARaw = drmA, drmBs = 
Array(drmB))
+    val drmCooc = SimilarityAnalysis.cooccurrences(drmARaw = drmA, drmBs = 
Array(drmB))
     //var cp = drmSelfCooc(0).checkpoint()
     //cp.writeDRM("/tmp/cooc-spark/")//to get values written
     val matrixSelfCooc = drmCooc(0).checkpoint().collect
@@ -181,7 +181,7 @@ class CooccurrenceAnalysisSuite extends FunSuite with 
MahoutSuite with Distribut
     val drmB = drmParallelize(m = b, numPartitions = 2)
 
     //self similarity
-    val drmCooc = CooccurrenceAnalysis.cooccurrences(drmARaw = drmA, drmBs = 
Array(drmB))
+    val drmCooc = SimilarityAnalysis.cooccurrences(drmARaw = drmA, drmBs = 
Array(drmB))
     val matrixSelfCooc = drmCooc(0).checkpoint().collect
     val diffMatrix = matrixSelfCooc.minus(matrixLLRCoocAtAControl)
     var n = (new MatrixOps(m = diffMatrix)).norm
@@ -227,21 +227,6 @@ class CooccurrenceAnalysisSuite extends FunSuite with 
MahoutSuite with Distribut
          (1, 0, 1, 0, 0),
          (0, 0, 0, 0, 0))
 
-        for (MatrixSlice row : cooccurrence) {
-            for (Vector.Element element : row.vector().nonZeroes()) {
-                long k11 = (long) element.get();// = 1
-                long k12 = (long) (rowSums.get(row.index()) - k11);// = 0
-                long k21 = (long) (colSums.get(element.index()) - k11);// = 1
-                long k22 = (long) (total - k11 - k12 - k21);// = 2
-                double score = LogLikelihood.rootLogLikelihoodRatio(k11, k12, 
k21, k22);
-                element.set(score);
-            }
-        }
-
-        for some reason the hadoop version returns the following
-        return 1.0 - 1.0 / (1.0 + logLikelihood);
-        so not a pure llr or root llr
-
     */
 
     //item (1,0)
@@ -250,7 +235,7 @@ class CooccurrenceAnalysisSuite extends FunSuite with 
MahoutSuite with Distribut
     val numInteractionsWithB = 2L
     val numInteractions = 6l
 
-    val llr = CooccurrenceAnalysis.logLikelihoodRatio(numInteractionsWithA, 
numInteractionsWithB, numInteractionsWithAandB, numInteractions)
+    val llr = SimilarityAnalysis.logLikelihoodRatio(numInteractionsWithA, 
numInteractionsWithB, numInteractionsWithAandB, numInteractions)
 
     assert(llr == 2.6341457841558764) // value calculated by hadoop 
itemsimilairty
   }
@@ -263,7 +248,7 @@ class CooccurrenceAnalysisSuite extends FunSuite with 
MahoutSuite with Distribut
         (1, 1, 0, 1, 0))
     val drmA: DrmLike[Int] = drmParallelize(m = a, numPartitions = 2)
 
-    val downSampledDrm = CooccurrenceAnalysis.sampleDownAndBinarize(drmA, 
0xdeadbeef, 4)
+    val downSampledDrm = SimilarityAnalysis.sampleDownAndBinarize(drmA, 
0xdeadbeef, 4)
     //count non-zero values, should be == 7
     var numValues = 0
     val m = downSampledDrm.collect

http://git-wip-us.apache.org/repos/asf/mahout/blob/149c9859/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
----------------------------------------------------------------------
diff --git 
a/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
 
b/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
index 4bf1662..0a73469 100644
--- 
a/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
+++ 
b/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
@@ -26,6 +26,7 @@ import org.apache.mahout.math.drm._
 import org.apache.mahout.math.scalabindings._
 
 //todo: take out, only for temp tests
+
 import org.apache.mahout.math.scalabindings._
 import RLikeOps._
 import org.apache.mahout.math.drm._
@@ -33,53 +34,53 @@ import RLikeDrmOps._
 import scala.collection.JavaConversions._
 
 
-class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite  {
+class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
-/*
-  final val matrixLLRCoocAtAControl = dense(
-    (0.0,                0.6331745808516107, 0.0,                     0.0,     
           0.0),
-    (0.6331745808516107, 0.0,                0.0,                     0.0,     
           0.0),
-    (0.0,                0.0,                0.0,                     
0.6331745808516107, 0.0),
-    (0.0,                0.0,                0.6331745808516107,      0.0,     
           0.0),
-    (0.0,                0.0,                0.0,                     0.0,     
           0.0))
-
-  final val matrixLLRCoocBtAControl = dense(
-      (1.7260924347106847, 1.7260924347106847, 1.7260924347106847, 
1.7260924347106847, 0.0),
-      (0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 
0.6795961471815897, 0.0),
-      (0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 
0.6795961471815897, 0.0),
-      (1.7260924347106847, 1.7260924347106847, 1.7260924347106847, 
1.7260924347106847, 0.0),
-      (0.0,                0.0,                0.6795961471815897, 0.0,        
        4.498681156950466))
-*/
+  /*
+    final val matrixLLRCoocAtAControl = dense(
+      (0.0,                0.6331745808516107, 0.0,                     0.0,   
             0.0),
+      (0.6331745808516107, 0.0,                0.0,                     0.0,   
             0.0),
+      (0.0,                0.0,                0.0,                     
0.6331745808516107, 0.0),
+      (0.0,                0.0,                0.6331745808516107,      0.0,   
             0.0),
+      (0.0,                0.0,                0.0,                     0.0,   
             0.0))
+
+    final val matrixLLRCoocBtAControl = dense(
+        (1.7260924347106847, 1.7260924347106847, 1.7260924347106847, 
1.7260924347106847, 0.0),
+        (0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 
0.6795961471815897, 0.0),
+        (0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 
0.6795961471815897, 0.0),
+        (1.7260924347106847, 1.7260924347106847, 1.7260924347106847, 
1.7260924347106847, 0.0),
+        (0.0,                0.0,                0.6795961471815897, 0.0,      
          4.498681156950466))
+  */
 
 
   final val SelfSimilairtyLines = Iterable(
-      "galaxy\tnexus:1.7260924347106847",
-      "ipad\tiphone:1.7260924347106847",
-      "nexus\tgalaxy:1.7260924347106847",
-      "iphone\tipad:1.7260924347106847",
-      "surface")
+    "galaxy\tnexus:1.7260924347106847",
+    "ipad\tiphone:1.7260924347106847",
+    "nexus\tgalaxy:1.7260924347106847",
+    "iphone\tipad:1.7260924347106847",
+    "surface")
 
   val CrossIndicatorLines = Iterable(
-      "iphone\tnexus:1.7260924347106847 iphone:1.7260924347106847 
ipad:1.7260924347106847 galaxy:1.7260924347106847",
-      "ipad\tnexus:0.6795961471815897 iphone:0.6795961471815897 
ipad:0.6795961471815897 galaxy:0.6795961471815897",
-      "nexus\tnexus:0.6795961471815897 iphone:0.6795961471815897 
ipad:0.6795961471815897 galaxy:0.6795961471815897",
-      "galaxy\tnexus:1.7260924347106847 iphone:1.7260924347106847 
ipad:1.7260924347106847 galaxy:1.7260924347106847",
-      "surface\tsurface:4.498681156950466 nexus:0.6795961471815897")
+    "iphone\tnexus:1.7260924347106847 iphone:1.7260924347106847 
ipad:1.7260924347106847 galaxy:1.7260924347106847",
+    "ipad\tnexus:0.6795961471815897 iphone:0.6795961471815897 
ipad:0.6795961471815897 galaxy:0.6795961471815897",
+    "nexus\tnexus:0.6795961471815897 iphone:0.6795961471815897 
ipad:0.6795961471815897 galaxy:0.6795961471815897",
+    "galaxy\tnexus:1.7260924347106847 iphone:1.7260924347106847 
ipad:1.7260924347106847 galaxy:1.7260924347106847",
+    "surface\tsurface:4.498681156950466 nexus:0.6795961471815897")
 
   // todo: a better test would be to sort each vector by itemID and compare 
rows, tokens misses some error cases
   final val SelfSimilairtyTokens = tokenize(Iterable(
-      "galaxy\tnexus:1.7260924347106847",
-      "ipad\tiphone:1.7260924347106847",
-      "nexus\tgalaxy:1.7260924347106847",
-      "iphone\tipad:1.7260924347106847",
-      "surface"))
+    "galaxy\tnexus:1.7260924347106847",
+    "ipad\tiphone:1.7260924347106847",
+    "nexus\tgalaxy:1.7260924347106847",
+    "iphone\tipad:1.7260924347106847",
+    "surface"))
 
   val CrossIndicatorTokens = tokenize(Iterable(
-      "iphone\tnexus:1.7260924347106847 iphone:1.7260924347106847 
ipad:1.7260924347106847 galaxy:1.7260924347106847",
-      "ipad\tnexus:0.6795961471815897 iphone:0.6795961471815897 
ipad:0.6795961471815897 galaxy:0.6795961471815897",
-      "nexus\tnexus:0.6795961471815897 iphone:0.6795961471815897 
ipad:0.6795961471815897 galaxy:0.6795961471815897",
-      "galaxy\tnexus:1.7260924347106847 iphone:1.7260924347106847 
ipad:1.7260924347106847 galaxy:1.7260924347106847",
-      "surface\tsurface:4.498681156950466 nexus:0.6795961471815897"))
+    "iphone\tnexus:1.7260924347106847 iphone:1.7260924347106847 
ipad:1.7260924347106847 galaxy:1.7260924347106847",
+    "ipad\tnexus:0.6795961471815897 iphone:0.6795961471815897 
ipad:0.6795961471815897 galaxy:0.6795961471815897",
+    "nexus\tnexus:0.6795961471815897 iphone:0.6795961471815897 
ipad:0.6795961471815897 galaxy:0.6795961471815897",
+    "galaxy\tnexus:1.7260924347106847 iphone:1.7260924347106847 
ipad:1.7260924347106847 galaxy:1.7260924347106847",
+    "surface\tsurface:4.498681156950466 nexus:0.6795961471815897"))
 
   // now in MahoutSuite
   // final val TmpDir = "tmp/" // all IO going to whatever the default HDFS 
config is pointing to
@@ -113,32 +114,32 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
     ))
   */
 
-  test ("ItemSimilarityDriver, non-full-spec CSV"){
+  test("ItemSimilarityDriver, non-full-spec CSV") {
 
     val InFile = TmpDir + "in-file.csv/" //using part files, not single file
     val OutPath = TmpDir + "indicator-matrices/"
 
     val lines = Array(
-        "u1,purchase,iphone",
-        "u1,purchase,ipad",
-        "u2,purchase,nexus",
-        "u2,purchase,galaxy",
-        "u3,purchase,surface",
-        "u4,purchase,iphone",
-        "u4,purchase,galaxy",
-        "u1,view,iphone",
-        "u1,view,ipad",
-        "u1,view,nexus",
-        "u1,view,galaxy",
-        "u2,view,iphone",
-        "u2,view,ipad",
-        "u2,view,nexus",
-        "u2,view,galaxy",
-        "u3,view,surface",
-        "u3,view,nexus",
-        "u4,view,iphone",
-        "u4,view,ipad",
-        "u4,view,galaxy")
+      "u1,purchase,iphone",
+      "u1,purchase,ipad",
+      "u2,purchase,nexus",
+      "u2,purchase,galaxy",
+      "u3,purchase,surface",
+      "u4,purchase,iphone",
+      "u4,purchase,galaxy",
+      "u1,view,iphone",
+      "u1,view,ipad",
+      "u1,view,nexus",
+      "u1,view,galaxy",
+      "u2,view,iphone",
+      "u2,view,ipad",
+      "u2,view,nexus",
+      "u2,view,galaxy",
+      "u3,view,surface",
+      "u3,view,nexus",
+      "u4,view,iphone",
+      "u4,view,ipad",
+      "u4,view,galaxy")
 
     // this will create multiple part-xxxxx files in the InFile dir but other 
tests will
     // take account of one actual file
@@ -146,22 +147,22 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
 
     // local multi-threaded Spark with default HDFS
     ItemSimilarityDriver.main(Array(
-        "--input", InFile,
-        "--output", OutPath,
-        "--master", masterUrl,
-        "--filter1", "purchase",
-        "--filter2", "view",
-        "--inDelim", ",",
-        "--itemIDPosition", "2",
-        "--rowIDPosition", "0",
-        "--filterPosition", "1",
-        "--writeAllDatasets"))
+      "--input", InFile,
+      "--output", OutPath,
+      "--master", masterUrl,
+      "--filter1", "purchase",
+      "--filter2", "view",
+      "--inDelim", ",",
+      "--itemIDPosition", "2",
+      "--rowIDPosition", "0",
+      "--filterPosition", "1",
+      "--writeAllDatasets"))
 
     // todo: these comparisons rely on a sort producing the same lines, which 
could possibly
     // fail since the sort is on value and these can be the same for all items 
in a vector
-    val indicatorLines = 
mahoutCtx.textFile(OutPath+"/indicator-matrix/").collect.toIterable
+    val indicatorLines = mahoutCtx.textFile(OutPath + 
"/indicator-matrix/").collect.toIterable
     tokenize(indicatorLines) should contain theSameElementsAs 
SelfSimilairtyTokens
-    val crossIndicatorLines = 
mahoutCtx.textFile(OutPath+"/cross-indicator-matrix/").collect.toIterable
+    val crossIndicatorLines = mahoutCtx.textFile(OutPath + 
"/cross-indicator-matrix/").collect.toIterable
     tokenize(crossIndicatorLines) should contain theSameElementsAs 
CrossIndicatorTokens
   }
 
@@ -173,26 +174,26 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
     val OutPath = TmpDir + "indicator-matrices/"
 
     val lines = Array(
-        "u1\tpurchase\tiphone",
-        "u1\tpurchase\tipad",
-        "u2\tpurchase\tnexus",
-        "u2\tpurchase\tgalaxy",
-        "u3\tpurchase\tsurface",
-        "u4\tpurchase\tiphone",
-        "u4\tpurchase\tgalaxy",
-        "u1\tview\tiphone",
-        "u1\tview\tipad",
-        "u1\tview\tnexus",
-        "u1\tview\tgalaxy",
-        "u2\tview\tiphone",
-        "u2\tview\tipad",
-        "u2\tview\tnexus",
-        "u2\tview\tgalaxy",
-        "u3\tview\tsurface",
-        "u3\tview\tnexus",
-        "u4\tview\tiphone",
-        "u4\tview\tipad",
-        "u4\tview\tgalaxy")
+      "u1\tpurchase\tiphone",
+      "u1\tpurchase\tipad",
+      "u2\tpurchase\tnexus",
+      "u2\tpurchase\tgalaxy",
+      "u3\tpurchase\tsurface",
+      "u4\tpurchase\tiphone",
+      "u4\tpurchase\tgalaxy",
+      "u1\tview\tiphone",
+      "u1\tview\tipad",
+      "u1\tview\tnexus",
+      "u1\tview\tgalaxy",
+      "u2\tview\tiphone",
+      "u2\tview\tipad",
+      "u2\tview\tnexus",
+      "u2\tview\tgalaxy",
+      "u3\tview\tsurface",
+      "u3\tview\tnexus",
+      "u4\tview\tiphone",
+      "u4\tview\tipad",
+      "u4\tview\tgalaxy")
 
     // this will create multiple part-xxxxx files in the InFile dir but other 
tests will
     // take account of one actual file
@@ -200,21 +201,21 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
 
     // local multi-threaded Spark with default HDFS
     ItemSimilarityDriver.main(Array(
-        "--input", InFile,
-        "--output", OutPath,
-        "--master", masterUrl,
-        "--filter1", "purchase",
-        "--filter2", "view",
-        "--inDelim", "[,\t]",
-        "--itemIDPosition", "2",
-        "--rowIDPosition", "0",
-        "--filterPosition", "1"))
+      "--input", InFile,
+      "--output", OutPath,
+      "--master", masterUrl,
+      "--filter1", "purchase",
+      "--filter2", "view",
+      "--inDelim", "[,\t]",
+      "--itemIDPosition", "2",
+      "--rowIDPosition", "0",
+      "--filterPosition", "1"))
 
     // todo: a better test would be to get sorted vectors and compare rows 
instead of tokens, this might miss
     // some error cases
-    val indicatorLines = 
mahoutCtx.textFile(OutPath+"/indicator-matrix/").collect.toIterable
+    val indicatorLines = mahoutCtx.textFile(OutPath + 
"/indicator-matrix/").collect.toIterable
     tokenize(indicatorLines) should contain theSameElementsAs 
SelfSimilairtyTokens
-    val crossIndicatorLines = 
mahoutCtx.textFile(OutPath+"/cross-indicator-matrix/").collect.toIterable
+    val crossIndicatorLines = mahoutCtx.textFile(OutPath + 
"/cross-indicator-matrix/").collect.toIterable
     tokenize(crossIndicatorLines) should contain theSameElementsAs 
CrossIndicatorTokens
 
   }
@@ -225,26 +226,26 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
     val OutPath = TmpDir + "indicator-matrices/"
 
     val lines = Array(
-        "2014-06-23 14:46:53.115\tu1\tpurchase\trandom text\tiphone",
-        "2014-06-23 14:46:53.115\tu1\tpurchase\trandom text\tipad",
-        "2014-06-23 14:46:53.115\tu2\tpurchase\trandom text\tnexus",
-        "2014-06-23 14:46:53.115\tu2\tpurchase\trandom text\tgalaxy",
-        "2014-06-23 14:46:53.115\tu3\tpurchase\trandom text\tsurface",
-        "2014-06-23 14:46:53.115\tu4\tpurchase\trandom text\tiphone",
-        "2014-06-23 14:46:53.115\tu4\tpurchase\trandom text\tgalaxy",
-        "2014-06-23 14:46:53.115\tu1\tview\trandom text\tiphone",
-        "2014-06-23 14:46:53.115\tu1\tview\trandom text\tipad",
-        "2014-06-23 14:46:53.115\tu1\tview\trandom text\tnexus",
-        "2014-06-23 14:46:53.115\tu1\tview\trandom text\tgalaxy",
-        "2014-06-23 14:46:53.115\tu2\tview\trandom text\tiphone",
-        "2014-06-23 14:46:53.115\tu2\tview\trandom text\tipad",
-        "2014-06-23 14:46:53.115\tu2\tview\trandom text\tnexus",
-        "2014-06-23 14:46:53.115\tu2\tview\trandom text\tgalaxy",
-        "2014-06-23 14:46:53.115\tu3\tview\trandom text\tsurface",
-        "2014-06-23 14:46:53.115\tu3\tview\trandom text\tnexus",
-        "2014-06-23 14:46:53.115\tu4\tview\trandom text\tiphone",
-        "2014-06-23 14:46:53.115\tu4\tview\trandom text\tipad",
-        "2014-06-23 14:46:53.115\tu4\tview\trandom text\tgalaxy")
+      "2014-06-23 14:46:53.115\tu1\tpurchase\trandom text\tiphone",
+      "2014-06-23 14:46:53.115\tu1\tpurchase\trandom text\tipad",
+      "2014-06-23 14:46:53.115\tu2\tpurchase\trandom text\tnexus",
+      "2014-06-23 14:46:53.115\tu2\tpurchase\trandom text\tgalaxy",
+      "2014-06-23 14:46:53.115\tu3\tpurchase\trandom text\tsurface",
+      "2014-06-23 14:46:53.115\tu4\tpurchase\trandom text\tiphone",
+      "2014-06-23 14:46:53.115\tu4\tpurchase\trandom text\tgalaxy",
+      "2014-06-23 14:46:53.115\tu1\tview\trandom text\tiphone",
+      "2014-06-23 14:46:53.115\tu1\tview\trandom text\tipad",
+      "2014-06-23 14:46:53.115\tu1\tview\trandom text\tnexus",
+      "2014-06-23 14:46:53.115\tu1\tview\trandom text\tgalaxy",
+      "2014-06-23 14:46:53.115\tu2\tview\trandom text\tiphone",
+      "2014-06-23 14:46:53.115\tu2\tview\trandom text\tipad",
+      "2014-06-23 14:46:53.115\tu2\tview\trandom text\tnexus",
+      "2014-06-23 14:46:53.115\tu2\tview\trandom text\tgalaxy",
+      "2014-06-23 14:46:53.115\tu3\tview\trandom text\tsurface",
+      "2014-06-23 14:46:53.115\tu3\tview\trandom text\tnexus",
+      "2014-06-23 14:46:53.115\tu4\tview\trandom text\tiphone",
+      "2014-06-23 14:46:53.115\tu4\tview\trandom text\tipad",
+      "2014-06-23 14:46:53.115\tu4\tview\trandom text\tgalaxy")
 
     // this will create multiple part-xxxxx files in the InFile dir but other 
tests will
     // take account of one actual file
@@ -252,20 +253,20 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
 
     // local multi-threaded Spark with default HDFS
     ItemSimilarityDriver.main(Array(
-        "--input", InFile,
-        "--output", OutPath,
-        "--master", masterUrl,
-        "--filter1", "purchase",
-        "--filter2", "view",
-        "--inDelim", "\t",
-        "--itemIDPosition", "4",
-        "--rowIDPosition", "1",
-        "--filterPosition", "2"))
+      "--input", InFile,
+      "--output", OutPath,
+      "--master", masterUrl,
+      "--filter1", "purchase",
+      "--filter2", "view",
+      "--inDelim", "\t",
+      "--itemIDPosition", "4",
+      "--rowIDPosition", "1",
+      "--filterPosition", "2"))
 
 
-    val indicatorLines = 
mahoutCtx.textFile(OutPath+"/indicator-matrix/").collect.toIterable
+    val indicatorLines = mahoutCtx.textFile(OutPath + 
"/indicator-matrix/").collect.toIterable
     tokenize(indicatorLines) should contain theSameElementsAs 
SelfSimilairtyTokens
-    val crossIndicatorLines = 
mahoutCtx.textFile(OutPath+"/cross-indicator-matrix/").collect.toIterable
+    val crossIndicatorLines = mahoutCtx.textFile(OutPath + 
"/cross-indicator-matrix/").collect.toIterable
     tokenize(crossIndicatorLines) should contain theSameElementsAs 
CrossIndicatorTokens
 
   }
@@ -279,23 +280,23 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
     val OutPath = TmpDir + "indicator-matrices"
 
     val lines = Array(
-        "0,0,1",
-        "0,1,1",
-        "1,2,1",
-        "1,3,1",
-        "2,4,1",
-        "3,0,1",
-        "3,3,1")
+      "0,0,1",
+      "0,1,1",
+      "1,2,1",
+      "1,3,1",
+      "2,4,1",
+      "3,0,1",
+      "3,3,1")
 
     val Answer = tokenize(Iterable(
-        "0\t1:1.7260924347106847",
-        "3\t2:1.7260924347106847",
-        "1\t0:1.7260924347106847",
-        "4",
-        "2\t3:1.7260924347106847"))
+      "0\t1:1.7260924347106847",
+      "3\t2:1.7260924347106847",
+      "1\t0:1.7260924347106847",
+      "4",
+      "2\t3:1.7260924347106847"))
 
     // this creates one part-0000 file in the directory
-    mahoutCtx.parallelize(lines).coalesce(1, 
shuffle=true).saveAsTextFile(InDir)
+    mahoutCtx.parallelize(lines).coalesce(1, shuffle = 
true).saveAsTextFile(InDir)
 
     // to change from using part files to a single .tsv file we'll need to use 
HDFS
     val fs = FileSystem.get(new Configuration())
@@ -304,11 +305,11 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
 
     // local multi-threaded Spark with default HDFS
     ItemSimilarityDriver.main(Array(
-        "--input", InPath,
-        "--output", OutPath,
-        "--master", masterUrl))
+      "--input", InPath,
+      "--output", OutPath,
+      "--master", masterUrl))
 
-    val indicatorLines = 
mahoutCtx.textFile(OutPath+"/indicator-matrix/").collect.toIterable
+    val indicatorLines = mahoutCtx.textFile(OutPath + 
"/indicator-matrix/").collect.toIterable
     tokenize(indicatorLines) should contain theSameElementsAs Answer
 
   }
@@ -322,23 +323,23 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
     val OutPath = TmpDir + "indicator-matrices"
 
     val lines = Array(
-        "0,0,1",
-        "0,1,1",
-        "1,2,1",
-        "1,3,1",
-        "2,4,1",
-        "3,0,1",
-        "3,3,1")
+      "0,0,1",
+      "0,1,1",
+      "1,2,1",
+      "1,3,1",
+      "2,4,1",
+      "3,0,1",
+      "3,3,1")
 
     val Answer = tokenize(Iterable(
-        "0\t1",
-        "3\t2",
-        "1\t0",
-        "4",
-        "2\t3"))
+      "0\t1",
+      "3\t2",
+      "1\t0",
+      "4",
+      "2\t3"))
 
     // this creates one part-0000 file in the directory
-    mahoutCtx.parallelize(lines).coalesce(1, 
shuffle=true).saveAsTextFile(InDir)
+    mahoutCtx.parallelize(lines).coalesce(1, shuffle = 
true).saveAsTextFile(InDir)
 
     // to change from using part files to a single .tsv file we'll need to use 
HDFS
     val fs = FileSystem.get(new Configuration())
@@ -347,12 +348,12 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
 
     // local multi-threaded Spark with default HDFS
     ItemSimilarityDriver.main(Array(
-        "--input", InPath,
-        "--output", OutPath,
-        "--master", masterUrl,
-        "--omitStrength"))
+      "--input", InPath,
+      "--output", OutPath,
+      "--master", masterUrl,
+      "--omitStrength"))
 
-    val indicatorLines = 
mahoutCtx.textFile(OutPath+"/indicator-matrix/").collect.toIterable
+    val indicatorLines = mahoutCtx.textFile(OutPath + 
"/indicator-matrix/").collect.toIterable
     tokenize(indicatorLines) should contain theSameElementsAs Answer
 
   }
@@ -362,28 +363,28 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
     // tmp/data/m1.tsv
     // tmp/data/more-data/another-dir/m2.tsv
     val M1Lines = Array(
-        "u1\tpurchase\tiphone",
-        "u1\tpurchase\tipad",
-        "u2\tpurchase\tnexus",
-        "u2\tpurchase\tgalaxy",
-        "u3\tpurchase\tsurface",
-        "u4\tpurchase\tiphone",
-        "u4\tpurchase\tgalaxy",
-        "u1\tview\tiphone")
+      "u1\tpurchase\tiphone",
+      "u1\tpurchase\tipad",
+      "u2\tpurchase\tnexus",
+      "u2\tpurchase\tgalaxy",
+      "u3\tpurchase\tsurface",
+      "u4\tpurchase\tiphone",
+      "u4\tpurchase\tgalaxy",
+      "u1\tview\tiphone")
 
     val M2Lines = Array(
-        "u1\tview\tipad",
-        "u1\tview\tnexus",
-        "u1\tview\tgalaxy",
-        "u2\tview\tiphone",
-        "u2\tview\tipad",
-        "u2\tview\tnexus",
-        "u2\tview\tgalaxy",
-        "u3\tview\tsurface",
-        "u3\tview\tnexus",
-        "u4\tview\tiphone",
-        "u4\tview\tipad",
-        "u4\tview\tgalaxy")
+      "u1\tview\tipad",
+      "u1\tview\tnexus",
+      "u1\tview\tgalaxy",
+      "u2\tview\tiphone",
+      "u2\tview\tipad",
+      "u2\tview\tnexus",
+      "u2\tview\tgalaxy",
+      "u3\tview\tsurface",
+      "u3\tview\tnexus",
+      "u4\tview\tiphone",
+      "u4\tview\tipad",
+      "u4\tview\tgalaxy")
 
     val InFilenameM1 = "m1.tsv"
     val InDirM1 = TmpDir + "data/"
@@ -396,7 +397,7 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
     val OutPath = TmpDir + "indicator-matrices"
 
     // this creates one part-0000 file in the directory
-    mahoutCtx.parallelize(M1Lines).coalesce(1, 
shuffle=true).saveAsTextFile(InDirM1)
+    mahoutCtx.parallelize(M1Lines).coalesce(1, shuffle = 
true).saveAsTextFile(InDirM1)
 
     // to change from using part files to a single .tsv file we'll need to use 
HDFS
     val fs = FileSystem.get(new Configuration())
@@ -404,7 +405,7 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
     fs.rename(new Path(InDirM1 + "part-00000"), new Path(InPathM1))
 
     // this creates one part-0000 file in the directory
-    mahoutCtx.parallelize(M2Lines).coalesce(1, 
shuffle=true).saveAsTextFile(InDirM2)
+    mahoutCtx.parallelize(M2Lines).coalesce(1, shuffle = 
true).saveAsTextFile(InDirM2)
 
     // to change from using part files to a single .tsv file we'll need to use 
HDFS
     //rename part-00000 to tmp/some-location/something.tsv
@@ -413,52 +414,52 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
     // local multi-threaded Spark with default FS, suitable for build tests 
but need better location for data
 
     ItemSimilarityDriver.main(Array(
-        "--input", InPathStart,
-        "--output", OutPath,
-        "--master", masterUrl,
-        "--filter1", "purchase",
-        "--filter2", "view",
-        "--inDelim", "\t",
-        "--itemIDPosition", "2",
-        "--rowIDPosition", "0",
-        "--filterPosition", "1",
-        "--filenamePattern", "m..tsv",
-        "--recursive"))
+      "--input", InPathStart,
+      "--output", OutPath,
+      "--master", masterUrl,
+      "--filter1", "purchase",
+      "--filter2", "view",
+      "--inDelim", "\t",
+      "--itemIDPosition", "2",
+      "--rowIDPosition", "0",
+      "--filterPosition", "1",
+      "--filenamePattern", "m..tsv",
+      "--recursive"))
 
-    val indicatorLines = 
mahoutCtx.textFile(OutPath+"/indicator-matrix/").collect.toIterable
+    val indicatorLines = mahoutCtx.textFile(OutPath + 
"/indicator-matrix/").collect.toIterable
     tokenize(indicatorLines) should contain theSameElementsAs 
SelfSimilairtyTokens
-    val crossIndicatorLines = 
mahoutCtx.textFile(OutPath+"/cross-indicator-matrix/").collect.toIterable
+    val crossIndicatorLines = mahoutCtx.textFile(OutPath + 
"/cross-indicator-matrix/").collect.toIterable
     tokenize(crossIndicatorLines) should contain theSameElementsAs 
CrossIndicatorTokens
 
   }
 
-  test ("ItemSimilarityDriver, two input paths"){
+  test("ItemSimilarityDriver, two input paths") {
 
     val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file
     val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file
     val OutPath = TmpDir + "indicator-matrices/"
 
     val lines = Array(
-        "u1,purchase,iphone",
-        "u1,purchase,ipad",
-        "u2,purchase,nexus",
-        "u2,purchase,galaxy",
-        "u3,purchase,surface",
-        "u4,purchase,iphone",
-        "u4,purchase,galaxy",
-        "u1,view,iphone",
-        "u1,view,ipad",
-        "u1,view,nexus",
-        "u1,view,galaxy",
-        "u2,view,iphone",
-        "u2,view,ipad",
-        "u2,view,nexus",
-        "u2,view,galaxy",
-        "u3,view,surface",
-        "u3,view,nexus",
-        "u4,view,iphone",
-        "u4,view,ipad",
-        "u4,view,galaxy")
+      "u1,purchase,iphone",
+      "u1,purchase,ipad",
+      "u2,purchase,nexus",
+      "u2,purchase,galaxy",
+      "u3,purchase,surface",
+      "u4,purchase,iphone",
+      "u4,purchase,galaxy",
+      "u1,view,iphone",
+      "u1,view,ipad",
+      "u1,view,nexus",
+      "u1,view,galaxy",
+      "u2,view,iphone",
+      "u2,view,ipad",
+      "u2,view,nexus",
+      "u2,view,galaxy",
+      "u3,view,surface",
+      "u3,view,nexus",
+      "u4,view,iphone",
+      "u4,view,ipad",
+      "u4,view,galaxy")
 
     // this will create multiple part-xxxxx files in the InFile dir but other 
tests will
     // take account of one actual file
@@ -467,65 +468,65 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
 
     // local multi-threaded Spark with default HDFS
     ItemSimilarityDriver.main(Array(
-        "--input", InFile1,
-        "--input2", InFile2,
-        "--output", OutPath,
-        "--master", masterUrl,
-        "--filter1", "purchase",
-        "--filter2", "view",
-        "--inDelim", ",",
-        "--itemIDPosition", "2",
-        "--rowIDPosition", "0",
-        "--filterPosition", "1"))
+      "--input", InFile1,
+      "--input2", InFile2,
+      "--output", OutPath,
+      "--master", masterUrl,
+      "--filter1", "purchase",
+      "--filter2", "view",
+      "--inDelim", ",",
+      "--itemIDPosition", "2",
+      "--rowIDPosition", "0",
+      "--filterPosition", "1"))
 
-    val indicatorLines = 
mahoutCtx.textFile(OutPath+"/indicator-matrix/").collect.toIterable
+    val indicatorLines = mahoutCtx.textFile(OutPath + 
"/indicator-matrix/").collect.toIterable
     tokenize(indicatorLines) should contain theSameElementsAs 
SelfSimilairtyTokens
-    val crossIndicatorLines = 
mahoutCtx.textFile(OutPath+"/cross-indicator-matrix/").collect.toIterable
+    val crossIndicatorLines = mahoutCtx.textFile(OutPath + 
"/cross-indicator-matrix/").collect.toIterable
     tokenize(crossIndicatorLines) should contain theSameElementsAs 
CrossIndicatorTokens
 
   }
 
-  test ("ItemSimilarityDriver, two inputs of different dimensions"){
+  test("ItemSimilarityDriver, two inputs of different dimensions") {
 
     val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file
     val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file
     val OutPath = TmpDir + "indicator-matrices/"
 
     val lines = Array(
-        "u1,purchase,iphone",
-        "u1,purchase,ipad",
-        "u2,purchase,nexus",
-        "u2,purchase,galaxy",
-        // remove one user so A'B will be of different dimensions
-        // ItemSimilarityDriver should create one unified user dictionary and 
so account for this
-        // discrepancy as a blank row: "u3,purchase,surface",
-        "u4,purchase,iphone",
-        "u4,purchase,galaxy",
-        "u1,view,iphone",
-        "u1,view,ipad",
-        "u1,view,nexus",
-        "u1,view,galaxy",
-        "u2,view,iphone",
-        "u2,view,ipad",
-        "u2,view,nexus",
-        "u2,view,galaxy",
-        "u3,view,surface",
-        "u3,view,nexus",
-        "u4,view,iphone",
-        "u4,view,ipad",
-        "u4,view,galaxy")
+      "u1,purchase,iphone",
+      "u1,purchase,ipad",
+      "u2,purchase,nexus",
+      "u2,purchase,galaxy",
+      // remove one user so A'B will be of different dimensions
+      // ItemSimilarityDriver should create one unified user dictionary and so 
account for this
+      // discrepancy as a blank row: "u3,purchase,surface",
+      "u4,purchase,iphone",
+      "u4,purchase,galaxy",
+      "u1,view,iphone",
+      "u1,view,ipad",
+      "u1,view,nexus",
+      "u1,view,galaxy",
+      "u2,view,iphone",
+      "u2,view,ipad",
+      "u2,view,nexus",
+      "u2,view,galaxy",
+      "u3,view,surface",
+      "u3,view,nexus",
+      "u4,view,iphone",
+      "u4,view,ipad",
+      "u4,view,galaxy")
 
     val UnequalDimensionsSelfSimilarity = tokenize(Iterable(
-        "ipad\tiphone:1.7260924347106847",
-        "iphone\tipad:1.7260924347106847",
-        "nexus\tgalaxy:1.7260924347106847",
-        "galaxy\tnexus:1.7260924347106847"))
+      "ipad\tiphone:1.7260924347106847",
+      "iphone\tipad:1.7260924347106847",
+      "nexus\tgalaxy:1.7260924347106847",
+      "galaxy\tnexus:1.7260924347106847"))
 
     val UnequalDimensionsCrossSimilarity = tokenize(Iterable(
-        "galaxy\tgalaxy:1.7260924347106847 iphone:1.7260924347106847 
ipad:1.7260924347106847 nexus:1.7260924347106847",
-        "iphone\tgalaxy:1.7260924347106847 iphone:1.7260924347106847 
ipad:1.7260924347106847 surface:1.7260924347106847 nexus:1.7260924347106847",
-        "ipad\tgalaxy:0.6795961471815897 iphone:0.6795961471815897 
ipad:0.6795961471815897 nexus:0.6795961471815897",
-        "nexus\tiphone:0.6795961471815897 ipad:0.6795961471815897 
nexus:0.6795961471815897 galaxy:0.6795961471815897"))
+      "galaxy\tgalaxy:1.7260924347106847 iphone:1.7260924347106847 
ipad:1.7260924347106847 nexus:1.7260924347106847",
+      "iphone\tgalaxy:1.7260924347106847 iphone:1.7260924347106847 
ipad:1.7260924347106847 surface:1.7260924347106847 nexus:1.7260924347106847",
+      "ipad\tgalaxy:0.6795961471815897 iphone:0.6795961471815897 
ipad:0.6795961471815897 nexus:0.6795961471815897",
+      "nexus\tiphone:0.6795961471815897 ipad:0.6795961471815897 
nexus:0.6795961471815897 galaxy:0.6795961471815897"))
     // this will create multiple part-xxxxx files in the InFile dir but other 
tests will
     // take account of one actual file
     val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1)
@@ -533,25 +534,25 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
 
     // local multi-threaded Spark with default HDFS
     ItemSimilarityDriver.main(Array(
-        "--input", InFile1,
-        "--input2", InFile2,
-        "--output", OutPath,
-        "--master", masterUrl,
-        "--filter1", "purchase",
-        "--filter2", "view",
-        "--inDelim", ",",
-        "--itemIDPosition", "2",
-        "--rowIDPosition", "0",
-        "--filterPosition", "1"))
+      "--input", InFile1,
+      "--input2", InFile2,
+      "--output", OutPath,
+      "--master", masterUrl,
+      "--filter1", "purchase",
+      "--filter2", "view",
+      "--inDelim", ",",
+      "--itemIDPosition", "2",
+      "--rowIDPosition", "0",
+      "--filterPosition", "1"))
 
-    val indicatorLines = 
mahoutCtx.textFile(OutPath+"/indicator-matrix/").collect.toIterable
-    val crossIndicatorLines = 
mahoutCtx.textFile(OutPath+"/cross-indicator-matrix/").collect.toIterable
+    val indicatorLines = mahoutCtx.textFile(OutPath + 
"/indicator-matrix/").collect.toIterable
+    val crossIndicatorLines = mahoutCtx.textFile(OutPath + 
"/cross-indicator-matrix/").collect.toIterable
     tokenize(indicatorLines) should contain theSameElementsAs 
UnequalDimensionsSelfSimilarity
     tokenize(crossIndicatorLines) should contain theSameElementsAs 
UnequalDimensionsCrossSimilarity
 
   }
 
-  test("ItemSimilarityDriver cross similarity two separate items spaces"){
+  test("ItemSimilarityDriver cross similarity two separate items spaces") {
     /* cross-similarity with category views, same user space
                phones  tablets mobile_acc      soap
           u1   0             1       1           0
@@ -564,29 +565,29 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
     val OutPath = TmpDir + "indicator-matrices/"
 
     val lines = Array(
-        "u1,purchase,iphone",
-        "u1,purchase,ipad",
-        "u2,purchase,nexus",
-        "u2,purchase,galaxy",
-        "u3,purchase,surface",
-        "u4,purchase,iphone",
-        "u4,purchase,galaxy",
-        "u1,view,phones",
-        "u1,view,mobile_acc",
-        "u2,view,phones",
-        "u2,view,tablets",
-        "u2,view,mobile_acc",
-        "u3,view,mobile_acc",
-        "u4,view,phones",
-        "u4,view,tablets",
-        "u4,view,soap")
+      "u1,purchase,iphone",
+      "u1,purchase,ipad",
+      "u2,purchase,nexus",
+      "u2,purchase,galaxy",
+      "u3,purchase,surface",
+      "u4,purchase,iphone",
+      "u4,purchase,galaxy",
+      "u1,view,phones",
+      "u1,view,mobile_acc",
+      "u2,view,phones",
+      "u2,view,tablets",
+      "u2,view,mobile_acc",
+      "u3,view,mobile_acc",
+      "u4,view,phones",
+      "u4,view,tablets",
+      "u4,view,soap")
 
     val UnequalDimensionsCrossSimilarityLines = tokenize(Iterable(
-        "iphone\tmobile_acc:1.7260924347106847 soap:1.7260924347106847 
phones:1.7260924347106847",
-        "surface\tmobile_acc:0.6795961471815897",
-        "nexus\ttablets:1.7260924347106847 mobile_acc:0.6795961471815897 
phones:0.6795961471815897",
-        "galaxy\ttablets:5.545177444479561 soap:1.7260924347106847 
phones:1.7260924347106847 mobile_acc:1.7260924347106847",
-        "ipad\tmobile_acc:0.6795961471815897 phones:0.6795961471815897"))
+      "iphone\tmobile_acc:1.7260924347106847 soap:1.7260924347106847 
phones:1.7260924347106847",
+      "surface\tmobile_acc:0.6795961471815897",
+      "nexus\ttablets:1.7260924347106847 mobile_acc:0.6795961471815897 
phones:0.6795961471815897",
+      "galaxy\ttablets:5.545177444479561 soap:1.7260924347106847 
phones:1.7260924347106847 mobile_acc:1.7260924347106847",
+      "ipad\tmobile_acc:0.6795961471815897 phones:0.6795961471815897"))
 
     // this will create multiple part-xxxxx files in the InFile dir but other 
tests will
     // take account of one actual file
@@ -595,39 +596,39 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
 
     // local multi-threaded Spark with default HDFS
     ItemSimilarityDriver.main(Array(
-        "--input", InFile1,
-        "--input2", InFile2,
-        "--output", OutPath,
-        "--master", masterUrl,
-        "--filter1", "purchase",
-        "--filter2", "view",
-        "--inDelim", ",",
-        "--itemIDPosition", "2",
-        "--rowIDPosition", "0",
-        "--filterPosition", "1",
-        "--writeAllDatasets"))
+      "--input", InFile1,
+      "--input2", InFile2,
+      "--output", OutPath,
+      "--master", masterUrl,
+      "--filter1", "purchase",
+      "--filter2", "view",
+      "--inDelim", ",",
+      "--itemIDPosition", "2",
+      "--rowIDPosition", "0",
+      "--filterPosition", "1",
+      "--writeAllDatasets"))
 
-    val indicatorLines = 
mahoutCtx.textFile(OutPath+"/indicator-matrix/").collect.toIterable
-    val crossIndicatorLines = 
mahoutCtx.textFile(OutPath+"/cross-indicator-matrix/").collect.toIterable
+    val indicatorLines = mahoutCtx.textFile(OutPath + 
"/indicator-matrix/").collect.toIterable
+    val crossIndicatorLines = mahoutCtx.textFile(OutPath + 
"/cross-indicator-matrix/").collect.toIterable
     tokenize(indicatorLines) should contain theSameElementsAs 
SelfSimilairtyTokens
     tokenize(crossIndicatorLines) should contain theSameElementsAs 
UnequalDimensionsCrossSimilarityLines
 
   }
 
-  test("A.t %*% B after changing row cardinality of A"){
+  test("A.t %*% B after changing row cardinality of A") {
     // todo: move to math tests but this is Spark specific
 
     val a = dense(
-        (1.0, 1.0))
+      (1.0, 1.0))
 
     val b = dense(
-        (1.0, 1.0),
-        (1.0, 1.0),
-        (1.0, 1.0))
+      (1.0, 1.0),
+      (1.0, 1.0),
+      (1.0, 1.0))
 
     val inCoreABiggertBAnswer = dense(
-        (1.0, 1.0),
-        (1.0, 1.0))
+      (1.0, 1.0),
+      (1.0, 1.0))
 
     val drmA = drmParallelize(m = a, numPartitions = 2)
     val drmB = drmParallelize(m = b, numPartitions = 2)
@@ -645,7 +646,7 @@ class ItemSimilarityDriverSuite extends FunSuite with 
DistributedSparkSuite  {
     val bp = 0
   }
 
-  test("ItemSimilarityDriver cross similarity two separate items spaces, 
missing rows in B"){
+  test("ItemSimilarityDriver cross similarity two separate items spaces, 
missing rows in B") {
     /* cross-similarity with category views, same user space
                phones  tablets mobile_acc      soap
             u1 0             1       1           0
@@ -658,29 +659,29 @@ removed ==> u3    0             0       1           0
     val OutPath = TmpDir + "indicator-matrices/"
 
     val lines = Array(
-        "u1,purchase,iphone",
-        "u1,purchase,ipad",
-        "u2,purchase,nexus",
-        "u2,purchase,galaxy",
-        "u3,purchase,surface",
-        "u4,purchase,iphone",
-        "u4,purchase,galaxy",
-        "u1,view,phones",
-        "u1,view,mobile_acc",
-        "u2,view,phones",
-        "u2,view,tablets",
-        "u2,view,mobile_acc",
-        //"u3,view,mobile_acc",// if this line is removed the 
cross-cooccurrence should work
-        "u4,view,phones",
-        "u4,view,tablets",
-        "u4,view,soap")
+      "u1,purchase,iphone",
+      "u1,purchase,ipad",
+      "u2,purchase,nexus",
+      "u2,purchase,galaxy",
+      "u3,purchase,surface",
+      "u4,purchase,iphone",
+      "u4,purchase,galaxy",
+      "u1,view,phones",
+      "u1,view,mobile_acc",
+      "u2,view,phones",
+      "u2,view,tablets",
+      "u2,view,mobile_acc",
+      //"u3,view,mobile_acc",// if this line is removed the cross-cooccurrence 
should work
+      "u4,view,phones",
+      "u4,view,tablets",
+      "u4,view,soap")
 
     val UnequalDimensionsCrossSimilarityLines = tokenize(Iterable(
-        "galaxy\ttablets:5.545177444479561 soap:1.7260924347106847 
phones:1.7260924347106847",
-        "ipad\tmobile_acc:1.7260924347106847 phones:0.6795961471815897",
-        "surface",
-        "nexus\tmobile_acc:1.7260924347106847 tablets:1.7260924347106847 
phones:0.6795961471815897",
-        "iphone\tsoap:1.7260924347106847 phones:1.7260924347106847"))
+      "galaxy\ttablets:5.545177444479561 soap:1.7260924347106847 
phones:1.7260924347106847",
+      "ipad\tmobile_acc:1.7260924347106847 phones:0.6795961471815897",
+      "surface",
+      "nexus\tmobile_acc:1.7260924347106847 tablets:1.7260924347106847 
phones:0.6795961471815897",
+      "iphone\tsoap:1.7260924347106847 phones:1.7260924347106847"))
 
     // this will create multiple part-xxxxx files in the InFile dir but other 
tests will
     // take account of one actual file
@@ -689,20 +690,20 @@ removed ==> u3    0             0       1           0
 
     // local multi-threaded Spark with default HDFS
     ItemSimilarityDriver.main(Array(
-        "--input", InFile1,
-        "--input2", InFile2,
-        "--output", OutPath,
-        "--master", masterUrl,
-        "--filter1", "purchase",
-        "--filter2", "view",
-        "--inDelim", ",",
-        "--itemIDPosition", "2",
-        "--rowIDPosition", "0",
-        "--filterPosition", "1",
-        "--writeAllDatasets"))
+      "--input", InFile1,
+      "--input2", InFile2,
+      "--output", OutPath,
+      "--master", masterUrl,
+      "--filter1", "purchase",
+      "--filter2", "view",
+      "--inDelim", ",",
+      "--itemIDPosition", "2",
+      "--rowIDPosition", "0",
+      "--filterPosition", "1",
+      "--writeAllDatasets"))
 
-    val indicatorLines = 
mahoutCtx.textFile(OutPath+"/indicator-matrix/").collect.toIterable
-    val crossIndicatorLines = 
mahoutCtx.textFile(OutPath+"/cross-indicator-matrix/").collect.toIterable
+    val indicatorLines = mahoutCtx.textFile(OutPath + 
"/indicator-matrix/").collect.toIterable
+    val crossIndicatorLines = mahoutCtx.textFile(OutPath + 
"/cross-indicator-matrix/").collect.toIterable
     tokenize(indicatorLines) should contain theSameElementsAs 
SelfSimilairtyTokens
     tokenize(crossIndicatorLines) should contain theSameElementsAs 
UnequalDimensionsCrossSimilarityLines
   }
@@ -711,7 +712,7 @@ removed ==> u3      0             0       1           0
   def tokenize(a: Iterable[String]): Iterable[String] = {
     var r: Iterable[String] = Iterable()
     a.foreach { l =>
-      l.split("\t").foreach{ s =>
+      l.split("\t").foreach { s =>
         r = r ++ s.split("[\t ]")
       }
     }
@@ -722,5 +723,5 @@ removed ==> u3      0             0       1           0
     super.beforeAll(configMap)
     ItemSimilarityDriver.useContext(mahoutCtx)
   }
-  
+
 }

http://git-wip-us.apache.org/repos/asf/mahout/blob/149c9859/spark/src/test/scala/org/apache/mahout/drivers/RowSimilarityDriverSuite.scala
----------------------------------------------------------------------
diff --git 
a/spark/src/test/scala/org/apache/mahout/drivers/RowSimilarityDriverSuite.scala 
b/spark/src/test/scala/org/apache/mahout/drivers/RowSimilarityDriverSuite.scala
new file mode 100644
index 0000000..562e8c6
--- /dev/null
+++ 
b/spark/src/test/scala/org/apache/mahout/drivers/RowSimilarityDriverSuite.scala
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.drivers
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.drm._
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import org.apache.mahout.math.scalabindings._
+import org.apache.mahout.sparkbindings._
+import org.apache.mahout.sparkbindings.test.DistributedSparkSuite
+import org.scalatest.{ConfigMap, FunSuite}
+
+
+/**
+ * Tests for the RowSimilarityDriver command line job, run on a small corpus of text documents.
+ *
+ * Each test writes `TextDocs` to a single .tsv file under `TmpDir`, calls
+ * `RowSimilarityDriver.main` on it and compares the tokens of the rows read back from the output
+ * path with the expected tokens.
+ */
+class RowSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
+
+  // input corpus, one "docID<tab>text" line per document
+  val TextDocs = Array(
+    "doc1\tNow is the time for all good people to come to aid of their party",
+    "doc2\tNow is the time for all good people to come to aid of their country",
+    "doc3\tNow is the time for all good people to come to aid of their hood",
+    "doc4\tNow is the time for all good people to come to aid of their friends",
+    "doc5\tNow is the time for all good people to come to aid of their looser brother",
+    "doc6\tThe quick brown fox jumped over the lazy dog",
+    "doc7\tThe quick brown fox jumped over the lazy boy",
+    "doc8\tThe quick brown fox jumped over the lazy cat",
+    "doc9\tThe quick brown fox jumped over the lazy wolverine",
+    "doc10\tThe quick brown fox jumped over the lazy cantelope")// yes that's spelled correctly.
+
+  test("RowSimilarityDriver text docs no strengths") {
+
+    // expected token set for each of the output rows 0 to 4
+    val firstFiveSimDocsTokens = tokenize(Iterable(
+      "doc1\tdoc3 doc2 doc4 doc5"))
+
+    // expected token set for each of the output rows 5 to 9
+    val lastFiveSimDocsTokens = tokenize(Iterable(
+      "doc6\tdoc8 doc10 doc7 doc9"))
+
+    val inDir = TmpDir + "in-dir/"
+    val inFilename = "in-file.tsv"
+    val inPath = inDir + inFilename
+
+    val outPath = TmpDir + "similarity-matrices/"
+
+
+    // this creates one part-00000 file in the directory
+    mahoutCtx.parallelize(TextDocs).coalesce(1, shuffle=true).saveAsTextFile(inDir)
+
+    // to change from using part files to a single .tsv file we'll need to use HDFS
+    val fs = FileSystem.get(new Configuration())
+    //rename part-00000 to something.tsv
+    fs.rename(new Path(inDir + "part-00000"), new Path(inPath))
+
+    // local multi-threaded Spark with default HDFS
+    RowSimilarityDriver.main(Array(
+      "--input", inPath,
+      "--output", outPath,
+      "--omitStrength",
+      "--maxSimilaritiesPerRow", "4", // would get all docs similar if we didn't limit them
+      "--master", masterUrl))
+
+    val simLines = mahoutCtx.textFile(outPath).collect
+    // the order of tokens inside a row is not checked, only which tokens the row contains
+    for (rowNum <- 0 to 4){
+      simLines(rowNum).split("[\t ]") should contain theSameElementsAs firstFiveSimDocsTokens
+    }
+    for (rowNum <- 5 to 9){
+      simLines(rowNum).split("[\t ]") should contain theSameElementsAs lastFiveSimDocsTokens
+    }
+
+  }
+
+  test("RowSimilarityDriver text docs") {
+
+    // expected output rows including strengths, flattened into one collection of tokens
+    val simDocsTokens = tokenize(Iterable(
+      "doc1\tdoc3:27.87301122947484 doc2:27.87301122947484 doc4:27.87301122947484 doc5:23.42278065550721",
+      "doc2\tdoc4:27.87301122947484 doc3:27.87301122947484 doc1:27.87301122947484 doc5:23.42278065550721",
+      "doc3\tdoc4:27.87301122947484 doc2:27.87301122947484 doc1:27.87301122947484 doc5:23.42278065550721",
+      "doc4\tdoc3:27.87301122947484 doc2:27.87301122947484 doc1:27.87301122947484 doc5:23.42278065550721",
+      "doc5\tdoc4:23.42278065550721 doc2:23.42278065550721 doc3:23.42278065550721 doc1:23.42278065550721",
+      "doc6\tdoc8:22.936393049704463 doc10:22.936393049704463 doc7:22.936393049704463 doc9:22.936393049704463",
+      "doc7\tdoc6:22.936393049704463 doc8:22.936393049704463 doc10:22.936393049704463 doc9:22.936393049704463",
+      "doc8\tdoc6:22.936393049704463 doc10:22.936393049704463 doc7:22.936393049704463 doc9:22.936393049704463",
+      "doc9\tdoc6:22.936393049704463 doc8:22.936393049704463 doc10:22.936393049704463 doc7:22.936393049704463",
+      "doc10\tdoc6:22.936393049704463 doc8:22.936393049704463 doc7:22.936393049704463 doc9:22.936393049704463"))
+
+    val inDir = TmpDir + "in-dir/"
+    val inFilename = "in-file.tsv"
+    val inPath = inDir + inFilename
+
+    val outPath = TmpDir + "similarity-matrix/"
+
+
+    // this creates one part-00000 file in the directory
+    mahoutCtx.parallelize(TextDocs).coalesce(1, shuffle=true).saveAsTextFile(inDir)
+
+    // to change from using part files to a single .tsv file we'll need to use HDFS
+    val fs = FileSystem.get(new Configuration())
+    //rename part-00000 to something.tsv
+    fs.rename(new Path(inDir + "part-00000"), new Path(inPath))
+
+    // local multi-threaded Spark with default HDFS
+    RowSimilarityDriver.main(Array(
+      "--input", inPath,
+      "--output", outPath,
+      "--maxSimilaritiesPerRow", "4", // would get all docs similar if we didn't limit them
+      "--master", masterUrl))
+
+    val simLines = mahoutCtx.textFile(outPath).collect
+    tokenize(simLines) should contain theSameElementsAs simDocsTokens
+  }
+
+  /**
+   * Convert lines into an Iterable of tokens for 'should contain theSameElementsAs Iterable'.
+   *
+   * Every line is split with `splitString` and the pieces of all lines are concatenated in
+   * line order.
+   *
+   * @param a lines to split
+   * @param splitString regular expression handed to String.split, by default a tab or a space
+   * @return the tokens of all lines; empty when `a` is empty
+   */
+  def tokenize(a: Iterable[String], splitString: String = "[\t ]"): Iterable[String] =
+    a.flatMap(_.split(splitString))
+
+  /** Runs the inherited setup, then passes `mahoutCtx` to the driver via `useContext`. */
+  override protected def beforeAll(configMap: ConfigMap): Unit = {
+    super.beforeAll(configMap)
+    RowSimilarityDriver.useContext(mahoutCtx)
+  }
+
+}

[1/2] MAHOUT-1604 add a CLI and associated code for spark-rowsimilarity, also cleans up some things in MAHOUT-1568 and MAHOUT-1569, closes apache/mahout#47

Reply via email to