http://git-wip-us.apache.org/repos/asf/mahout/blob/034790cc/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala ---------------------------------------------------------------------- diff --git a/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala b/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala index 628d981..fc84577 100644 --- a/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala +++ b/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala @@ -1,832 +1,832 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.drivers - -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{Path, FileSystem} -import org.apache.mahout.math.indexeddataset.{BiDictionary, IndexedDataset} -import org.apache.mahout.sparkbindings.indexeddataset.IndexedDatasetSpark -import org.scalatest.{ConfigMap, FunSuite} -import org.apache.mahout.sparkbindings._ -import org.apache.mahout.sparkbindings.test.DistributedSparkSuite -import org.apache.mahout.math.drm._ -import org.apache.mahout.math.scalabindings._ - -import scala.collection.immutable.HashMap - -//todo: take out, only for temp tests - -import org.apache.mahout.math.scalabindings._ -import RLikeOps._ -import org.apache.mahout.math.drm._ -import RLikeDrmOps._ -import scala.collection.JavaConversions._ - - -class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite { - - /* - final val matrixLLRCoocAtAControl = dense( - (0.0, 0.6331745808516107, 0.0, 0.0, 0.0), - (0.6331745808516107, 0.0, 0.0, 0.0, 0.0), - (0.0, 0.0, 0.0, 0.6331745808516107, 0.0), - (0.0, 0.0, 0.6331745808516107, 0.0, 0.0), - (0.0, 0.0, 0.0, 0.0, 0.0)) - - final val matrixLLRCoocBtAControl = dense( - (1.7260924347106847, 1.7260924347106847, 1.7260924347106847, 1.7260924347106847, 0.0), - (0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 0.0), - (0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 0.0), - (1.7260924347106847, 1.7260924347106847, 1.7260924347106847, 1.7260924347106847, 0.0), - (0.0, 0.0, 0.6795961471815897, 0.0, 4.498681156950466)) - */ - - - final val SelfSimilairtyLines = Iterable( - "galaxy\tnexus:1.7260924347106847", - "ipad\tiphone:1.7260924347106847", - "nexus\tgalaxy:1.7260924347106847", - "iphone\tipad:1.7260924347106847", - "surface") - - val CrossSimilarityLines = Iterable( - "iphone\tnexus:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 galaxy:1.7260924347106847", - "ipad\tnexus:0.6795961471815897 iphone:0.6795961471815897 ipad:0.6795961471815897 galaxy:0.6795961471815897", - "nexus\tnexus:0.6795961471815897 iphone:0.6795961471815897 ipad:0.6795961471815897 galaxy:0.6795961471815897", - "galaxy\tnexus:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 galaxy:1.7260924347106847", - "surface\tsurface:4.498681156950466 nexus:0.6795961471815897") - - // todo: a better test would be to sort each vector by itemID and compare rows, tokens misses some error cases - final val SelfSimilairtyTokens = tokenize(Iterable( - "galaxy\tnexus:1.7260924347106847", - "ipad\tiphone:1.7260924347106847", - "nexus\tgalaxy:1.7260924347106847", - "iphone\tipad:1.7260924347106847", - "surface")) - - val CrossSimilarityTokens = tokenize(Iterable( - "iphone\tnexus:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 galaxy:1.7260924347106847", - "ipad\tnexus:0.6795961471815897 iphone:0.6795961471815897 ipad:0.6795961471815897 galaxy:0.6795961471815897", - "nexus\tnexus:0.6795961471815897 iphone:0.6795961471815897 ipad:0.6795961471815897 galaxy:0.6795961471815897", - "galaxy\tnexus:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 galaxy:1.7260924347106847", - "surface\tsurface:4.498681156950466 nexus:0.6795961471815897")) - - /* - //Clustered Spark and HDFS, not a good everyday build test - ItemSimilarityDriver.main(Array( - "--input", "hdfs://occam4:54310/user/pat/spark-itemsimilarity/cf-data.txt", - "--output", "hdfs://occam4:54310/user/pat/spark-itemsimilarity/similarityMatrices/", - "--master", "spark://occam4:7077", - "--filter1", "purchase", - "--filter2", "view", - "--inDelim", ",", - "--itemIDColumn", "2", - "--rowIDColumn", "0", - "--filterColumn", "1")) - */ - // local multi-threaded Spark with HDFS using large dataset - // not a good build test. - /* - ItemSimilarityDriver.main(Array( - "--input", "hdfs://occam4:54310/user/pat/xrsj/ratings_data.txt", - "--output", "hdfs://occam4:54310/user/pat/xrsj/similarityMatrices/", - "--master", "local[4]", - "--filter1", "purchase", - "--filter2", "view", - "--inDelim", ",", - "--itemIDColumn", "2", - "--rowIDColumn", "0", - "--filterColumn", "1")) - */ - - test("ItemSimilarityDriver, non-full-spec CSV") { - - val InFile = TmpDir + "in-file.csv/" //using part files, not single file - val OutPath = TmpDir + "similarity-matrices/" - - val lines = Array( - "u1,purchase,iphone", - "u1,purchase,ipad", - "u2,purchase,nexus", - "u2,purchase,galaxy", - "u3,purchase,surface", - "u4,purchase,iphone", - "u4,purchase,galaxy", - "u1,view,iphone", - "u1,view,ipad", - "u1,view,nexus", - "u1,view,galaxy", - "u2,view,iphone", - "u2,view,ipad", - "u2,view,nexus", - "u2,view,galaxy", - "u3,view,surface", - "u3,view,nexus", - "u4,view,iphone", - "u4,view,ipad", - "u4,view,galaxy") - - // this will create multiple part-xxxxx files in the InFile dir but other tests will - // take account of one actual file - val linesRdd = mahoutCtx.parallelize(lines).saveAsTextFile(InFile) - - // local multi-threaded Spark with default HDFS - ItemSimilarityDriver.main(Array( - "--input", InFile, - "--output", OutPath, - "--master", masterUrl, - "--filter1", "purchase", - "--filter2", "view", - "--inDelim", ",", - "--itemIDColumn", "2", - "--rowIDColumn", "0", - "--filterColumn", "1", - "--writeAllDatasets")) - - // todo: these comparisons rely on a sort producing the same lines, which could possibly - // fail since the sort is on value and these can be the same for all items in a vector - val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable - tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens - val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable - tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens - } - - - - test("ItemSimilarityDriver TSV ") { - - val InFile = TmpDir + "in-file.tsv/" - val OutPath = TmpDir + "similarity-matrices/" - - val lines = Array( - "u1\tpurchase\tiphone", - "u1\tpurchase\tipad", - "u2\tpurchase\tnexus", - "u2\tpurchase\tgalaxy", - "u3\tpurchase\tsurface", - "u4\tpurchase\tiphone", - "u4\tpurchase\tgalaxy", - "u1\tview\tiphone", - "u1\tview\tipad", - "u1\tview\tnexus", - "u1\tview\tgalaxy", - "u2\tview\tiphone", - "u2\tview\tipad", - "u2\tview\tnexus", - "u2\tview\tgalaxy", - "u3\tview\tsurface", - "u3\tview\tnexus", - "u4\tview\tiphone", - "u4\tview\tipad", - "u4\tview\tgalaxy") - - // this will create multiple part-xxxxx files in the InFile dir but other tests will - // take account of one actual file - val linesRdd = mahoutCtx.parallelize(lines).saveAsTextFile(InFile) - - // local multi-threaded Spark with default HDFS - ItemSimilarityDriver.main(Array( - "--input", InFile, - "--output", OutPath, - "--master", masterUrl, - "--filter1", "purchase", - "--filter2", "view", - "--inDelim", "[,\t]", - "--itemIDColumn", "2", - "--rowIDColumn", "0", - "--filterColumn", "1")) - - // todo: a better test would be to get sorted vectors and compare rows instead of tokens, this might miss - // some error cases - val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable - tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens - val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable - tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens - - } - - test("ItemSimilarityDriver log-ish files") { - - val InFile = TmpDir + "in-file.log/" - val OutPath = TmpDir + "similarity-matrices/" - - val lines = Array( - "2014-06-23 14:46:53.115\tu1\tpurchase\trandom text\tiphone", - "2014-06-23 14:46:53.115\tu1\tpurchase\trandom text\tipad", - "2014-06-23 14:46:53.115\tu2\tpurchase\trandom text\tnexus", - "2014-06-23 14:46:53.115\tu2\tpurchase\trandom text\tgalaxy", - "2014-06-23 14:46:53.115\tu3\tpurchase\trandom text\tsurface", - "2014-06-23 14:46:53.115\tu4\tpurchase\trandom text\tiphone", - "2014-06-23 14:46:53.115\tu4\tpurchase\trandom text\tgalaxy", - "2014-06-23 14:46:53.115\tu1\tview\trandom text\tiphone", - "2014-06-23 14:46:53.115\tu1\tview\trandom text\tipad", - "2014-06-23 14:46:53.115\tu1\tview\trandom text\tnexus", - "2014-06-23 14:46:53.115\tu1\tview\trandom text\tgalaxy", - "2014-06-23 14:46:53.115\tu2\tview\trandom text\tiphone", - "2014-06-23 14:46:53.115\tu2\tview\trandom text\tipad", - "2014-06-23 14:46:53.115\tu2\tview\trandom text\tnexus", - "2014-06-23 14:46:53.115\tu2\tview\trandom text\tgalaxy", - "2014-06-23 14:46:53.115\tu3\tview\trandom text\tsurface", - "2014-06-23 14:46:53.115\tu3\tview\trandom text\tnexus", - "2014-06-23 14:46:53.115\tu4\tview\trandom text\tiphone", - "2014-06-23 14:46:53.115\tu4\tview\trandom text\tipad", - "2014-06-23 14:46:53.115\tu4\tview\trandom text\tgalaxy") - - // this will create multiple part-xxxxx files in the InFile dir but other tests will - // take account of one actual file - val linesRdd = mahoutCtx.parallelize(lines).saveAsTextFile(InFile) - - // local multi-threaded Spark with default HDFS - ItemSimilarityDriver.main(Array( - "--input", InFile, - "--output", OutPath, - "--master", masterUrl, - "--filter1", "purchase", - "--filter2", "view", - "--inDelim", "\t", - "--itemIDColumn", "4", - "--rowIDColumn", "1", - "--filterColumn", "2")) - - - val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable - tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens - val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable - tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens - - } - - test("ItemSimilarityDriver legacy supported file format") { - - val InDir = TmpDir + "in-dir/" - val InFilename = "in-file.tsv" - val InPath = InDir + InFilename - - val OutPath = TmpDir + "similarity-matrices" - - val lines = Array( - "0,0,1", - "0,1,1", - "1,2,1", - "1,3,1", - "2,4,1", - "3,0,1", - "3,3,1") - - val Answer = tokenize(Iterable( - "0\t1:1.7260924347106847", - "3\t2:1.7260924347106847", - "1\t0:1.7260924347106847", - "4", - "2\t3:1.7260924347106847")) - - // this creates one part-0000 file in the directory - mahoutCtx.parallelize(lines).coalesce(1, shuffle = true).saveAsTextFile(InDir) - - // to change from using part files to a single .tsv file we'll need to use HDFS - val fs = FileSystem.get(new Configuration()) - //rename part-00000 to something.tsv - fs.rename(new Path(InDir + "part-00000"), new Path(InPath)) - - // local multi-threaded Spark with default HDFS - ItemSimilarityDriver.main(Array( - "--input", InPath, - "--output", OutPath, - "--master", masterUrl)) - - val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable - tokenize(similarityLines) should contain theSameElementsAs Answer - - } - - test("ItemSimilarityDriver write search engine output") { - - val InDir = TmpDir + "in-dir/" - val InFilename = "in-file.tsv" - val InPath = InDir + InFilename - - val OutPath = TmpDir + "similarity-matrices" - - val lines = Array( - "0,0,1", - "0,1,1", - "1,2,1", - "1,3,1", - "2,4,1", - "3,0,1", - "3,3,1") - - val Answer = tokenize(Iterable( - "0\t1", - "3\t2", - "1\t0", - "4", - "2\t3")) - - // this creates one part-0000 file in the directory - mahoutCtx.parallelize(lines).coalesce(1, shuffle = true).saveAsTextFile(InDir) - - // to change from using part files to a single .tsv file we'll need to use HDFS - val fs = FileSystem.get(new Configuration()) - //rename part-00000 to something.tsv - fs.rename(new Path(InDir + "part-00000"), new Path(InPath)) - - // local multi-threaded Spark with default HDFS - ItemSimilarityDriver.main(Array( - "--input", InPath, - "--output", OutPath, - "--master", masterUrl, - "--omitStrength")) - - val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable - tokenize(similarityLines) should contain theSameElementsAs Answer - - } - - test("ItemSimilarityDriver recursive file discovery using filename patterns") { - //directory structure using the following - // tmp/data/m1.tsv - // tmp/data/more-data/another-dir/m2.tsv - val M1Lines = Array( - "u1\tpurchase\tiphone", - "u1\tpurchase\tipad", - "u2\tpurchase\tnexus", - "u2\tpurchase\tgalaxy", - "u3\tpurchase\tsurface", - "u4\tpurchase\tiphone", - "u4\tpurchase\tgalaxy") - - val M2Lines = Array( - "u1\tview\tiphone", - "u1\tview\tipad", - "u1\tview\tnexus", - "u1\tview\tgalaxy", - "u2\tview\tiphone", - "u2\tview\tipad", - "u2\tview\tnexus", - "u2\tview\tgalaxy", - "u3\tview\tsurface", - "u3\tview\tnexus", - "u4\tview\tiphone", - "u4\tview\tipad", - "u4\tview\tgalaxy") - - val InFilenameM1 = "m1.tsv" - val InDirM1 = TmpDir + "data/" - val InPathM1 = InDirM1 + InFilenameM1 - val InFilenameM2 = "m2.tsv" - val InDirM2 = TmpDir + "data/more-data/another-dir/" - val InPathM2 = InDirM2 + InFilenameM2 - - val InPathStart = TmpDir + "data/" - val OutPath = TmpDir + "similarity-matrices" - - // this creates one part-0000 file in the directory - mahoutCtx.parallelize(M1Lines).coalesce(1, shuffle = true).saveAsTextFile(InDirM1) - - // to change from using part files to a single .tsv file we'll need to use HDFS - val fs = FileSystem.get(new Configuration()) - //rename part-00000 to something.tsv - fs.rename(new Path(InDirM1 + "part-00000"), new Path(InPathM1)) - - // this creates one part-0000 file in the directory - mahoutCtx.parallelize(M2Lines).coalesce(1, shuffle = true).saveAsTextFile(InDirM2) - - // to change from using part files to a single .tsv file we'll need to use HDFS - //rename part-00000 to tmp/some-location/something.tsv - fs.rename(new Path(InDirM2 + "part-00000"), new Path(InPathM2)) - - // local multi-threaded Spark with default FS, suitable for build tests but need better location for data - - ItemSimilarityDriver.main(Array( - "--input", InPathStart, - "--output", OutPath, - "--master", masterUrl, - "--filter1", "purchase", - "--filter2", "view", - "--inDelim", "\t", - "--itemIDColumn", "2", - "--rowIDColumn", "0", - "--filterColumn", "1", - "--filenamePattern", "m..tsv", - "--recursive")) - - val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable - tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens - val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable - tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens - - } - - test("ItemSimilarityDriver, two input paths") { - - val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file - val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file - val OutPath = TmpDir + "similarity-matrices/" - - val lines = Array( - "u1,purchase,iphone", - "u1,purchase,ipad", - "u2,purchase,nexus", - "u2,purchase,galaxy", - "u3,purchase,surface", - "u4,purchase,iphone", - "u4,purchase,galaxy", - "u1,view,iphone", - "u1,view,ipad", - "u1,view,nexus", - "u1,view,galaxy", - "u2,view,iphone", - "u2,view,ipad", - "u2,view,nexus", - "u2,view,galaxy", - "u3,view,surface", - "u3,view,nexus", - "u4,view,iphone", - "u4,view,ipad", - "u4,view,galaxy") - - // this will create multiple part-xxxxx files in the InFile dir but other tests will - // take account of one actual file - val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) - val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) - - // local multi-threaded Spark with default HDFS - ItemSimilarityDriver.main(Array( - "--input", InFile1, - "--input2", InFile2, - "--output", OutPath, - "--master", masterUrl, - "--filter1", "purchase", - "--filter2", "view", - "--inDelim", ",", - "--itemIDColumn", "2", - "--rowIDColumn", "0", - "--filterColumn", "1")) - - val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable - tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens - val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable - tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens - - } - - test("ItemSimilarityDriver, two inputs of different dimensions") { - - val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file - val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file - val OutPath = TmpDir + "similarity-matrices/" - - val lines = Array( - "u1,purchase,iphone", - "u1,purchase,ipad", - "u2,purchase,nexus", - "u2,purchase,galaxy", - // remove one user so A'B will be of different dimensions - // ItemSimilarityDriver should create one unified user dictionary and so account for this - // discrepancy as a blank row: "u3,purchase,surface", - "u4,purchase,iphone", - "u4,purchase,galaxy", - "u1,view,iphone", - "u1,view,ipad", - "u1,view,nexus", - "u1,view,galaxy", - "u2,view,iphone", - "u2,view,ipad", - "u2,view,nexus", - "u2,view,galaxy", - "u3,view,surface", - "u3,view,nexus", - "u4,view,iphone", - "u4,view,ipad", - "u4,view,galaxy") - - val UnequalDimensionsSelfSimilarity = tokenize(Iterable( - "ipad\tiphone:1.7260924347106847", - "iphone\tipad:1.7260924347106847", - "nexus\tgalaxy:1.7260924347106847", - "galaxy\tnexus:1.7260924347106847")) - - //only surface purchase was removed so no cross-similarity for surface - val UnequalDimensionsCrossSimilarity = tokenize(Iterable( - "galaxy\tgalaxy:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 nexus:1.7260924347106847", - "iphone\tgalaxy:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 nexus:1.7260924347106847", - "ipad\tgalaxy:0.6795961471815897 iphone:0.6795961471815897 ipad:0.6795961471815897 nexus:0.6795961471815897", - "nexus\tiphone:0.6795961471815897 ipad:0.6795961471815897 nexus:0.6795961471815897 galaxy:0.6795961471815897")) - // this will create multiple part-xxxxx files in the InFile dir but other tests will - // take account of one actual file - val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) - val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) - - // local multi-threaded Spark with default HDFS - ItemSimilarityDriver.main(Array( - "--input", InFile1, - "--input2", InFile2, - "--output", OutPath, - "--master", masterUrl, - "--filter1", "purchase", - "--filter2", "view", - "--inDelim", ",", - "--itemIDColumn", "2", - "--rowIDColumn", "0", - "--filterColumn", "1")) - - val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable - val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable - tokenize(similarityLines) should contain theSameElementsAs UnequalDimensionsSelfSimilarity - tokenize(crossSimilarityLines) should contain theSameElementsAs UnequalDimensionsCrossSimilarity - - } - - test("ItemSimilarityDriver cross similarity two separate items spaces") { - /* cross-similarity with category views, same user space - phones tablets mobile_acc soap - u1 0 1 1 0 - u2 1 1 1 0 - u3 0 0 1 0 - u4 1 1 0 1 - */ - val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file - val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file - val OutPath = TmpDir + "similarity-matrices/" - - val lines = Array( - "u1,purchase,iphone", - "u1,purchase,ipad", - "u2,purchase,nexus", - "u2,purchase,galaxy", - "u3,purchase,surface", - "u4,purchase,iphone", - "u4,purchase,galaxy", - "u1,view,phones", - "u1,view,mobile_acc", - "u2,view,phones", - "u2,view,tablets", - "u2,view,mobile_acc", - "u3,view,mobile_acc", - "u4,view,phones", - "u4,view,tablets", - "u4,view,soap") - - val UnequalDimensionsCrossSimilarityLines = tokenize(Iterable( - "iphone\tmobile_acc:1.7260924347106847 soap:1.7260924347106847 phones:1.7260924347106847", - "surface\tmobile_acc:0.6795961471815897", - "nexus\ttablets:1.7260924347106847 mobile_acc:0.6795961471815897 phones:0.6795961471815897", - "galaxy\ttablets:5.545177444479561 soap:1.7260924347106847 phones:1.7260924347106847 " + - "mobile_acc:1.7260924347106847", - "ipad\tmobile_acc:0.6795961471815897 phones:0.6795961471815897")) - - // this will create multiple part-xxxxx files in the InFile dir but other tests will - // take account of one actual file - val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) - val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) - - // local multi-threaded Spark with default HDFS - ItemSimilarityDriver.main(Array( - "--input", InFile1, - "--input2", InFile2, - "--output", OutPath, - "--master", masterUrl, - "--filter1", "purchase", - "--filter2", "view", - "--inDelim", ",", - "--itemIDColumn", "2", - "--rowIDColumn", "0", - "--filterColumn", "1", - "--writeAllDatasets")) - - val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable - val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable - tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens - tokenize(crossSimilarityLines) should contain theSameElementsAs UnequalDimensionsCrossSimilarityLines - - } - - test("A.t %*% B after changing row cardinality of A") { - // todo: move to math tests but this is Spark specific - - val a = dense( - (1.0, 1.0)) - - val b = dense( - (1.0, 1.0), - (1.0, 1.0), - (1.0, 1.0)) - - val inCoreABiggertBAnswer = dense( - (1.0, 1.0), - (1.0, 1.0)) - - val drmA = drmParallelize(m = a, numPartitions = 2) - val drmB = drmParallelize(m = b, numPartitions = 2) - - // modified to return a new CheckpointedDrm so maintains immutability but still only increases the row cardinality - // by returning new CheckpointedDrmSpark[K](rdd, n, ncol, _cacheStorageLevel ) Hack for now. - val drmABigger = drmWrap[Int](drmA.rdd, 3, 2) - - - val ABiggertB = drmABigger.t %*% drmB - val inCoreABiggertB = ABiggertB.collect - - assert(inCoreABiggertB === inCoreABiggertBAnswer) - - val bp = 0 - } - - test("Changing row cardinality of an IndexedDataset") { - - val a = dense( - (1.0, 1.0)) - - val drmA = drmParallelize(m = a, numPartitions = 2) - val emptyIDs = new BiDictionary(new HashMap[String, Int]()) - val indexedDatasetA = new IndexedDatasetSpark(drmA, emptyIDs, emptyIDs) - val biggerIDSA = indexedDatasetA.newRowCardinality(5) - - assert(biggerIDSA.matrix.nrow == 5) - - } - - test("ItemSimilarityDriver cross similarity two separate items spaces, missing rows in B") { - /* cross-similarity with category views, same user space - phones tablets mobile_acc soap - u1 0 1 1 0 - u2 1 1 1 0 -removed ==> u3 0 0 1 0 - u4 1 1 0 1 - */ - val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file - val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file - val OutPath = TmpDir + "similarity-matrices/" - - val lines = Array( - "u1,purchase,iphone", - "u1,purchase,ipad", - "u2,purchase,nexus", - "u2,purchase,galaxy", - "u3,purchase,surface", - "u4,purchase,iphone", - "u4,purchase,galaxy", - "u1,view,phones", - "u1,view,mobile_acc", - "u2,view,phones", - "u2,view,tablets", - "u2,view,mobile_acc", - //"u3,view,mobile_acc",// if this line is removed the cross-cooccurrence should work - "u4,view,phones", - "u4,view,tablets", - "u4,view,soap") - - val UnequalDimensionsCrossSimilarityLines = tokenize(Iterable( - "galaxy\ttablets:5.545177444479561 soap:1.7260924347106847 phones:1.7260924347106847", - "ipad\tmobile_acc:1.7260924347106847 phones:0.6795961471815897", - "surface", - "nexus\tmobile_acc:1.7260924347106847 tablets:1.7260924347106847 phones:0.6795961471815897", - "iphone\tsoap:1.7260924347106847 phones:1.7260924347106847")) - - // this will create multiple part-xxxxx files in the InFile dir but other tests will - // take account of one actual file - val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) - val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) - - // local multi-threaded Spark with default HDFS - ItemSimilarityDriver.main(Array( - "--input", InFile1, - "--input2", InFile2, - "--output", OutPath, - "--master", masterUrl, - "--filter1", "purchase", - "--filter2", "view", - "--inDelim", ",", - "--itemIDColumn", "2", - "--rowIDColumn", "0", - "--filterColumn", "1", - "--writeAllDatasets")) - - val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable - val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable - tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens - tokenize(crossSimilarityLines) should contain theSameElementsAs UnequalDimensionsCrossSimilarityLines - } - - test("ItemSimilarityDriver cross similarity two separate items spaces, adding rows in B") { - /* cross-similarity with category views, same user space - phones tablets mobile_acc soap - u1 0 1 1 0 - u2 1 1 1 0 -removed ==> u3 0 0 1 0 - u4 1 1 0 1 - */ - val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file - val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file - val OutPath = TmpDir + "similarity-matrices/" - - val lines = Array( - "u1,purchase,iphone", - "u1,purchase,ipad", - "u2,purchase,nexus", - "u2,purchase,galaxy", - "u3,purchase,surface", - "u4,purchase,iphone", - "u4,purchase,galaxy", - "u1,view,phones", - "u1,view,mobile_acc", - "u2,view,phones", - "u2,view,tablets", - "u2,view,mobile_acc", - "u3,view,mobile_acc",// if this line is removed the cross-cooccurrence should work - "u4,view,phones", - "u4,view,tablets", - "u4,view,soap", - "u5,view,soap") - - val UnequalDimensionsSimilarityTokens = List( - "galaxy", - "nexus:2.231435513142097", - "iphone:0.13844293808390518", - "nexus", - "galaxy:2.231435513142097", - "ipad", - "iphone:2.231435513142097", - "surface", - "iphone", - "ipad:2.231435513142097", - "galaxy:0.13844293808390518") - - val UnequalDimensionsCrossSimilarityLines = List( - "galaxy", - "tablets:6.730116670092563", - "phones:2.9110316603236868", - "soap:0.13844293808390518", - "mobile_acc:0.13844293808390518", - "nexus", - "tablets:2.231435513142097", - "mobile_acc:1.184939225613002", - "phones:1.184939225613002", - "ipad", "mobile_acc:1.184939225613002", - "phones:1.184939225613002", - "surface", - "mobile_acc:1.184939225613002", - "iphone", - "phones:2.9110316603236868", - "soap:0.13844293808390518", - "tablets:0.13844293808390518", - "mobile_acc:0.13844293808390518") - - // this will create multiple part-xxxxx files in the InFile dir but other tests will - // take account of one actual file - val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) - val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) - - // local multi-threaded Spark with default HDFS - ItemSimilarityDriver.main(Array( - "--input", InFile1, - "--input2", InFile2, - "--output", OutPath, - "--master", masterUrl, - "--filter1", "purchase", - "--filter2", "view", - "--inDelim", ",", - "--itemIDColumn", "2", - "--rowIDColumn", "0", - "--filterColumn", "1", - "--writeAllDatasets")) - - val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable - val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable - tokenize(similarityLines) should contain theSameElementsAs UnequalDimensionsSimilarityTokens - tokenize(crossSimilarityLines) should contain theSameElementsAs UnequalDimensionsCrossSimilarityLines - } - - // convert into an Iterable of tokens for 'should contain theSameElementsAs Iterable' - def tokenize(a: Iterable[String]): Iterable[String] = { - var r: Iterable[String] = Iterable() - a.foreach { l => - l.split("\t").foreach { s => - r = r ++ s.split("[\t ]") - } - } - r - } - - override protected def beforeAll(configMap: ConfigMap) { - super.beforeAll(configMap) - ItemSimilarityDriver.useContext(mahoutCtx) - } - -} +///* +// * Licensed to the Apache Software Foundation (ASF) under one or more +// * contributor license agreements. See the NOTICE file distributed with +// * this work for additional information regarding copyright ownership. +// * The ASF licenses this file to You under the Apache License, Version 2.0 +// * (the "License"); you may not use this file except in compliance with +// * the License. You may obtain a copy of the License at +// * +// * http://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// * See the License for the specific language governing permissions and +// * limitations under the License. +// */ +// +//package org.apache.mahout.drivers +// +//import org.apache.hadoop.conf.Configuration +//import org.apache.hadoop.fs.{Path, FileSystem} +//import org.apache.mahout.math.indexeddataset.{BiDictionary, IndexedDataset} +//import org.apache.mahout.sparkbindings.indexeddataset.IndexedDatasetSpark +//import org.scalatest.{ConfigMap, FunSuite} +//import org.apache.mahout.sparkbindings._ +//import org.apache.mahout.sparkbindings.test.DistributedSparkSuite +//import org.apache.mahout.math.drm._ +//import org.apache.mahout.math.scalabindings._ +// +//import scala.collection.immutable.HashMap +// +////todo: take out, only for temp tests +// +//import org.apache.mahout.math.scalabindings._ +//import RLikeOps._ +//import org.apache.mahout.math.drm._ +//import RLikeDrmOps._ +//import scala.collection.JavaConversions._ +// +// +//class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite { +// +// /* +// final val matrixLLRCoocAtAControl = dense( +// (0.0, 0.6331745808516107, 0.0, 0.0, 0.0), +// (0.6331745808516107, 0.0, 0.0, 0.0, 0.0), +// (0.0, 0.0, 0.0, 0.6331745808516107, 0.0), +// (0.0, 0.0, 0.6331745808516107, 0.0, 0.0), +// (0.0, 0.0, 0.0, 0.0, 0.0)) +// +// final val matrixLLRCoocBtAControl = dense( +// (1.7260924347106847, 1.7260924347106847, 1.7260924347106847, 1.7260924347106847, 0.0), +// (0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 0.0), +// (0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 0.0), +// (1.7260924347106847, 1.7260924347106847, 1.7260924347106847, 1.7260924347106847, 0.0), +// (0.0, 0.0, 0.6795961471815897, 0.0, 4.498681156950466)) +// */ +// +// +// final val SelfSimilairtyLines = Iterable( +// "galaxy\tnexus:1.7260924347106847", +// "ipad\tiphone:1.7260924347106847", +// "nexus\tgalaxy:1.7260924347106847", +// "iphone\tipad:1.7260924347106847", +// "surface") +// +// val CrossSimilarityLines = Iterable( +// "iphone\tnexus:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 galaxy:1.7260924347106847", +// "ipad\tnexus:0.6795961471815897 iphone:0.6795961471815897 ipad:0.6795961471815897 galaxy:0.6795961471815897", +// "nexus\tnexus:0.6795961471815897 iphone:0.6795961471815897 ipad:0.6795961471815897 galaxy:0.6795961471815897", +// "galaxy\tnexus:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 galaxy:1.7260924347106847", +// "surface\tsurface:4.498681156950466 nexus:0.6795961471815897") +// +// // todo: a better test would be to sort each vector by itemID and compare rows, tokens misses some error cases +// final val SelfSimilairtyTokens = tokenize(Iterable( +// "galaxy\tnexus:1.7260924347106847", +// "ipad\tiphone:1.7260924347106847", +// "nexus\tgalaxy:1.7260924347106847", +// "iphone\tipad:1.7260924347106847", +// "surface")) +// +// val CrossSimilarityTokens = tokenize(Iterable( +// "iphone\tnexus:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 galaxy:1.7260924347106847", +// "ipad\tnexus:0.6795961471815897 iphone:0.6795961471815897 ipad:0.6795961471815897 galaxy:0.6795961471815897", +// "nexus\tnexus:0.6795961471815897 iphone:0.6795961471815897 ipad:0.6795961471815897 galaxy:0.6795961471815897", +// "galaxy\tnexus:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 galaxy:1.7260924347106847", +// "surface\tsurface:4.498681156950466 nexus:0.6795961471815897")) +// +// /* +// //Clustered Spark and HDFS, not a good everyday build test +// ItemSimilarityDriver.main(Array( +// "--input", "hdfs://occam4:54310/user/pat/spark-itemsimilarity/cf-data.txt", +// "--output", "hdfs://occam4:54310/user/pat/spark-itemsimilarity/similarityMatrices/", +// "--master", "spark://occam4:7077", +// "--filter1", "purchase", +// "--filter2", "view", +// "--inDelim", ",", +// "--itemIDColumn", "2", +// "--rowIDColumn", "0", +// "--filterColumn", "1")) +// */ +// // local multi-threaded Spark with HDFS using large dataset +// // not a good build test. +// /* +// ItemSimilarityDriver.main(Array( +// "--input", "hdfs://occam4:54310/user/pat/xrsj/ratings_data.txt", +// "--output", "hdfs://occam4:54310/user/pat/xrsj/similarityMatrices/", +// "--master", "local[4]", +// "--filter1", "purchase", +// "--filter2", "view", +// "--inDelim", ",", +// "--itemIDColumn", "2", +// "--rowIDColumn", "0", +// "--filterColumn", "1")) +// */ +// +// test("ItemSimilarityDriver, non-full-spec CSV") { +// +// val InFile = TmpDir + "in-file.csv/" //using part files, not single file +// val OutPath = TmpDir + "similarity-matrices/" +// +// val lines = Array( +// "u1,purchase,iphone", +// "u1,purchase,ipad", +// "u2,purchase,nexus", +// "u2,purchase,galaxy", +// "u3,purchase,surface", +// "u4,purchase,iphone", +// "u4,purchase,galaxy", +// "u1,view,iphone", +// "u1,view,ipad", +// "u1,view,nexus", +// "u1,view,galaxy", +// "u2,view,iphone", +// "u2,view,ipad", +// "u2,view,nexus", +// "u2,view,galaxy", +// "u3,view,surface", +// "u3,view,nexus", +// "u4,view,iphone", +// "u4,view,ipad", +// "u4,view,galaxy") +// +// // this will create multiple part-xxxxx files in the InFile dir but other tests will +// // take account of one actual file +// val linesRdd = mahoutCtx.parallelize(lines).saveAsTextFile(InFile) +// +// // local multi-threaded Spark with default HDFS +// ItemSimilarityDriver.main(Array( +// "--input", InFile, +// "--output", OutPath, +// "--master", masterUrl, +// "--filter1", "purchase", +// "--filter2", "view", +// "--inDelim", ",", +// "--itemIDColumn", "2", +// "--rowIDColumn", "0", +// "--filterColumn", "1", +// "--writeAllDatasets")) +// +// // todo: these comparisons rely on a sort producing the same lines, which could possibly +// // fail since the sort is on value and these can be the same for all items in a vector +// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable +// tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens +// val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable +// tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens +// } +// +// +// +// test("ItemSimilarityDriver TSV ") { +// +// val InFile = TmpDir + "in-file.tsv/" +// val OutPath = TmpDir + "similarity-matrices/" +// +// val lines = Array( +// "u1\tpurchase\tiphone", +// "u1\tpurchase\tipad", +// "u2\tpurchase\tnexus", +// "u2\tpurchase\tgalaxy", +// "u3\tpurchase\tsurface", +// "u4\tpurchase\tiphone", +// "u4\tpurchase\tgalaxy", +// "u1\tview\tiphone", +// "u1\tview\tipad", +// "u1\tview\tnexus", +// "u1\tview\tgalaxy", +// "u2\tview\tiphone", +// "u2\tview\tipad", +// "u2\tview\tnexus", +// "u2\tview\tgalaxy", +// "u3\tview\tsurface", +// "u3\tview\tnexus", +// "u4\tview\tiphone", +// "u4\tview\tipad", +// "u4\tview\tgalaxy") +// +// // this will create multiple part-xxxxx files in the InFile dir but other tests will +// // take account of one actual file +// val linesRdd = mahoutCtx.parallelize(lines).saveAsTextFile(InFile) +// +// // local multi-threaded Spark with default HDFS +// ItemSimilarityDriver.main(Array( +// "--input", InFile, +// "--output", OutPath, +// "--master", masterUrl, +// "--filter1", "purchase", +// "--filter2", "view", +// "--inDelim", "[,\t]", +// "--itemIDColumn", "2", +// "--rowIDColumn", "0", +// "--filterColumn", "1")) +// +// // todo: a better test would be to get sorted vectors and compare rows instead of tokens, this might miss +// // some error cases +// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable +// tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens +// val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable +// tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens +// +// } +// +// test("ItemSimilarityDriver log-ish files") { +// +// val InFile = TmpDir + "in-file.log/" +// val OutPath = TmpDir + "similarity-matrices/" +// +// val lines = Array( +// "2014-06-23 14:46:53.115\tu1\tpurchase\trandom text\tiphone", +// "2014-06-23 14:46:53.115\tu1\tpurchase\trandom text\tipad", +// "2014-06-23 14:46:53.115\tu2\tpurchase\trandom text\tnexus", +// "2014-06-23 14:46:53.115\tu2\tpurchase\trandom text\tgalaxy", +// "2014-06-23 14:46:53.115\tu3\tpurchase\trandom text\tsurface", +// "2014-06-23 14:46:53.115\tu4\tpurchase\trandom text\tiphone", +// "2014-06-23 14:46:53.115\tu4\tpurchase\trandom text\tgalaxy", +// "2014-06-23 14:46:53.115\tu1\tview\trandom text\tiphone", +// "2014-06-23 14:46:53.115\tu1\tview\trandom text\tipad", +// "2014-06-23 14:46:53.115\tu1\tview\trandom text\tnexus", +// "2014-06-23 14:46:53.115\tu1\tview\trandom text\tgalaxy", +// "2014-06-23 14:46:53.115\tu2\tview\trandom text\tiphone", +// "2014-06-23 14:46:53.115\tu2\tview\trandom text\tipad", +// "2014-06-23 14:46:53.115\tu2\tview\trandom text\tnexus", +// "2014-06-23 14:46:53.115\tu2\tview\trandom text\tgalaxy", +// "2014-06-23 14:46:53.115\tu3\tview\trandom text\tsurface", +// "2014-06-23 14:46:53.115\tu3\tview\trandom text\tnexus", +// "2014-06-23 14:46:53.115\tu4\tview\trandom text\tiphone", +// "2014-06-23 14:46:53.115\tu4\tview\trandom text\tipad", +// "2014-06-23 14:46:53.115\tu4\tview\trandom text\tgalaxy") +// +// // this will create multiple part-xxxxx files in the InFile dir but other tests will +// // take account of one actual file +// val linesRdd = mahoutCtx.parallelize(lines).saveAsTextFile(InFile) +// +// // local multi-threaded Spark with default HDFS +// ItemSimilarityDriver.main(Array( +// "--input", InFile, +// "--output", OutPath, +// "--master", masterUrl, +// "--filter1", "purchase", +// "--filter2", "view", +// "--inDelim", "\t", +// "--itemIDColumn", "4", +// "--rowIDColumn", "1", +// "--filterColumn", "2")) +// +// +// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable +// tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens +// val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable +// tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens +// +// } +// +// test("ItemSimilarityDriver legacy supported file format") { +// +// val InDir = TmpDir + "in-dir/" +// val InFilename = "in-file.tsv" +// val InPath = InDir + InFilename +// +// val OutPath = TmpDir + "similarity-matrices" +// +// val lines = Array( +// "0,0,1", +// "0,1,1", +// "1,2,1", +// "1,3,1", +// "2,4,1", +// "3,0,1", +// "3,3,1") +// +// val Answer = tokenize(Iterable( +// "0\t1:1.7260924347106847", +// "3\t2:1.7260924347106847", +// "1\t0:1.7260924347106847", +// "4", +// "2\t3:1.7260924347106847")) +// +// // this creates one part-0000 file in the directory +// mahoutCtx.parallelize(lines).coalesce(1, shuffle = true).saveAsTextFile(InDir) +// +// // to change from using part files to a single .tsv file we'll need to use HDFS +// val fs = FileSystem.get(new Configuration()) +// //rename part-00000 to something.tsv +// fs.rename(new Path(InDir + "part-00000"), new Path(InPath)) +// +// // local multi-threaded Spark with default HDFS +// ItemSimilarityDriver.main(Array( +// "--input", InPath, +// "--output", OutPath, +// "--master", masterUrl)) +// +// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable +// tokenize(similarityLines) should contain theSameElementsAs Answer +// +// } +// +// test("ItemSimilarityDriver write search engine output") { +// +// val InDir = TmpDir + "in-dir/" +// val InFilename = "in-file.tsv" +// val InPath = InDir + InFilename +// +// val OutPath = TmpDir + "similarity-matrices" +// +// val lines = Array( +// "0,0,1", +// "0,1,1", +// "1,2,1", +// "1,3,1", +// "2,4,1", +// "3,0,1", +// "3,3,1") +// +// val Answer = tokenize(Iterable( +// "0\t1", +// "3\t2", +// "1\t0", +// "4", +// "2\t3")) +// +// // this creates one part-0000 file in the directory +// mahoutCtx.parallelize(lines).coalesce(1, shuffle = true).saveAsTextFile(InDir) +// +// // to change from using part files to a single .tsv file we'll need to use HDFS +// val fs = FileSystem.get(new Configuration()) +// //rename part-00000 to something.tsv +// fs.rename(new Path(InDir + "part-00000"), new Path(InPath)) +// +// // local multi-threaded Spark with default HDFS +// ItemSimilarityDriver.main(Array( +// "--input", InPath, +// "--output", OutPath, +// "--master", masterUrl, +// "--omitStrength")) +// +// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable +// tokenize(similarityLines) should contain theSameElementsAs Answer +// +// } +// +// test("ItemSimilarityDriver recursive file discovery using filename patterns") { +// //directory structure using the following +// // tmp/data/m1.tsv +// // tmp/data/more-data/another-dir/m2.tsv +// val M1Lines = Array( +// "u1\tpurchase\tiphone", +// "u1\tpurchase\tipad", +// "u2\tpurchase\tnexus", +// "u2\tpurchase\tgalaxy", +// "u3\tpurchase\tsurface", +// "u4\tpurchase\tiphone", +// "u4\tpurchase\tgalaxy") +// +// val M2Lines = Array( +// "u1\tview\tiphone", +// "u1\tview\tipad", +// "u1\tview\tnexus", +// "u1\tview\tgalaxy", +// "u2\tview\tiphone", +// "u2\tview\tipad", +// "u2\tview\tnexus", +// "u2\tview\tgalaxy", +// "u3\tview\tsurface", +// "u3\tview\tnexus", +// "u4\tview\tiphone", +// "u4\tview\tipad", +// "u4\tview\tgalaxy") +// +// val InFilenameM1 = "m1.tsv" +// val InDirM1 = TmpDir + "data/" +// val InPathM1 = InDirM1 + InFilenameM1 +// val InFilenameM2 = "m2.tsv" +// val InDirM2 = TmpDir + "data/more-data/another-dir/" +// val InPathM2 = InDirM2 + InFilenameM2 +// +// val InPathStart = TmpDir + "data/" +// val OutPath = TmpDir + "similarity-matrices" +// +// // this creates one part-0000 file in the directory +// mahoutCtx.parallelize(M1Lines).coalesce(1, shuffle = true).saveAsTextFile(InDirM1) +// +// // to change from using part files to a single .tsv file we'll need to use HDFS +// val fs = FileSystem.get(new Configuration()) +// //rename part-00000 to something.tsv +// fs.rename(new Path(InDirM1 + "part-00000"), new Path(InPathM1)) +// +// // this creates one part-0000 file in the directory +// mahoutCtx.parallelize(M2Lines).coalesce(1, shuffle = true).saveAsTextFile(InDirM2) +// +// // to change from using part files to a single .tsv file we'll need to use HDFS +// //rename part-00000 to tmp/some-location/something.tsv +// fs.rename(new Path(InDirM2 + "part-00000"), new Path(InPathM2)) +// +// // local multi-threaded Spark with default FS, suitable for build tests but need better location for data +// +// ItemSimilarityDriver.main(Array( +// "--input", InPathStart, +// "--output", OutPath, +// "--master", masterUrl, +// "--filter1", "purchase", +// "--filter2", "view", +// "--inDelim", "\t", +// "--itemIDColumn", "2", +// "--rowIDColumn", "0", +// "--filterColumn", "1", +// "--filenamePattern", "m..tsv", +// "--recursive")) +// +// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable +// tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens +// val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable +// tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens +// +// } +// +// test("ItemSimilarityDriver, two input paths") { +// +// val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file +// val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file +// val OutPath = TmpDir + "similarity-matrices/" +// +// val lines = Array( +// "u1,purchase,iphone", +// "u1,purchase,ipad", +// "u2,purchase,nexus", +// "u2,purchase,galaxy", +// "u3,purchase,surface", +// "u4,purchase,iphone", +// "u4,purchase,galaxy", +// "u1,view,iphone", +// "u1,view,ipad", +// "u1,view,nexus", +// "u1,view,galaxy", +// "u2,view,iphone", +// "u2,view,ipad", +// "u2,view,nexus", +// "u2,view,galaxy", +// "u3,view,surface", +// "u3,view,nexus", +// "u4,view,iphone", +// "u4,view,ipad", +// "u4,view,galaxy") +// +// // this will create multiple part-xxxxx files in the InFile dir but other tests will +// // take account of one actual file +// val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) +// val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) +// +// // local multi-threaded Spark with default HDFS +// ItemSimilarityDriver.main(Array( +// "--input", InFile1, +// "--input2", InFile2, +// "--output", OutPath, +// "--master", masterUrl, +// "--filter1", "purchase", +// "--filter2", "view", +// "--inDelim", ",", +// "--itemIDColumn", "2", +// "--rowIDColumn", "0", +// "--filterColumn", "1")) +// +// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable +// tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens +// val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable +// tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens +// +// } +// +// test("ItemSimilarityDriver, two inputs of different dimensions") { +// +// val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file +// val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file +// val OutPath = TmpDir + "similarity-matrices/" +// +// val lines = Array( +// "u1,purchase,iphone", +// "u1,purchase,ipad", +// "u2,purchase,nexus", +// "u2,purchase,galaxy", +// // remove one user so A'B will be of different dimensions +// // ItemSimilarityDriver should create one unified user dictionary and so account for this +// // discrepancy as a blank row: "u3,purchase,surface", +// "u4,purchase,iphone", +// "u4,purchase,galaxy", +// "u1,view,iphone", +// "u1,view,ipad", +// "u1,view,nexus", +// "u1,view,galaxy", +// "u2,view,iphone", +// "u2,view,ipad", +// "u2,view,nexus", +// "u2,view,galaxy", +// "u3,view,surface", +// "u3,view,nexus", +// "u4,view,iphone", +// "u4,view,ipad", +// "u4,view,galaxy") +// +// val UnequalDimensionsSelfSimilarity = tokenize(Iterable( +// "ipad\tiphone:1.7260924347106847", +// "iphone\tipad:1.7260924347106847", +// "nexus\tgalaxy:1.7260924347106847", +// "galaxy\tnexus:1.7260924347106847")) +// +// //only surface purchase was removed so no cross-similarity for surface +// val UnequalDimensionsCrossSimilarity = tokenize(Iterable( +// "galaxy\tgalaxy:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 nexus:1.7260924347106847", +// "iphone\tgalaxy:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 nexus:1.7260924347106847", +// "ipad\tgalaxy:0.6795961471815897 iphone:0.6795961471815897 ipad:0.6795961471815897 nexus:0.6795961471815897", +// "nexus\tiphone:0.6795961471815897 ipad:0.6795961471815897 nexus:0.6795961471815897 galaxy:0.6795961471815897")) +// // this will create multiple part-xxxxx files in the InFile dir but other tests will +// // take account of one actual file +// val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) +// val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) +// +// // local multi-threaded Spark with default HDFS +// ItemSimilarityDriver.main(Array( +// "--input", InFile1, +// "--input2", InFile2, +// "--output", OutPath, +// "--master", masterUrl, +// "--filter1", "purchase", +// "--filter2", "view", +// "--inDelim", ",", +// "--itemIDColumn", "2", +// "--rowIDColumn", "0", +// "--filterColumn", "1")) +// +// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable +// val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable +// tokenize(similarityLines) should contain theSameElementsAs UnequalDimensionsSelfSimilarity +// tokenize(crossSimilarityLines) should contain theSameElementsAs UnequalDimensionsCrossSimilarity +// +// } +// +// test("ItemSimilarityDriver cross similarity two separate items spaces") { +// /* cross-similarity with category views, same user space +// phones tablets mobile_acc soap +// u1 0 1 1 0 +// u2 1 1 1 0 +// u3 0 0 1 0 +// u4 1 1 0 1 +// */ +// val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file +// val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file +// val OutPath = TmpDir + "similarity-matrices/" +// +// val lines = Array( +// "u1,purchase,iphone", +// "u1,purchase,ipad", +// "u2,purchase,nexus", +// "u2,purchase,galaxy", +// "u3,purchase,surface", +// "u4,purchase,iphone", +// "u4,purchase,galaxy", +// "u1,view,phones", +// "u1,view,mobile_acc", +// "u2,view,phones", +// "u2,view,tablets", +// "u2,view,mobile_acc", +// "u3,view,mobile_acc", +// "u4,view,phones", +// "u4,view,tablets", +// "u4,view,soap") +// +// val UnequalDimensionsCrossSimilarityLines = tokenize(Iterable( +// "iphone\tmobile_acc:1.7260924347106847 soap:1.7260924347106847 phones:1.7260924347106847", +// "surface\tmobile_acc:0.6795961471815897", +// "nexus\ttablets:1.7260924347106847 mobile_acc:0.6795961471815897 phones:0.6795961471815897", +// "galaxy\ttablets:5.545177444479561 soap:1.7260924347106847 phones:1.7260924347106847 " + +// "mobile_acc:1.7260924347106847", +// "ipad\tmobile_acc:0.6795961471815897 phones:0.6795961471815897")) +// +// // this will create multiple part-xxxxx files in the InFile dir but other tests will +// // take account of one actual file +// val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) +// val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) +// +// // local multi-threaded Spark with default HDFS +// ItemSimilarityDriver.main(Array( +// "--input", InFile1, +// "--input2", InFile2, +// "--output", OutPath, +// "--master", masterUrl, +// "--filter1", "purchase", +// "--filter2", "view", +// "--inDelim", ",", +// "--itemIDColumn", "2", +// "--rowIDColumn", "0", +// "--filterColumn", "1", +// "--writeAllDatasets")) +// +// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable +// val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable +// tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens +// tokenize(crossSimilarityLines) should contain theSameElementsAs UnequalDimensionsCrossSimilarityLines +// +// } +// +// test("A.t %*% B after changing row cardinality of A") { +// // todo: move to math tests but this is Spark specific +// +// val a = dense( +// (1.0, 1.0)) +// +// val b = dense( +// (1.0, 1.0), +// (1.0, 1.0), +// (1.0, 1.0)) +// +// val inCoreABiggertBAnswer = dense( +// (1.0, 1.0), +// (1.0, 1.0)) +// +// val drmA = drmParallelize(m = a, numPartitions = 2) +// val drmB = drmParallelize(m = b, numPartitions = 2) +// +// // modified to return a new CheckpointedDrm so maintains immutability but still only increases the row cardinality +// // by returning new CheckpointedDrmSpark[K](rdd, n, ncol, _cacheStorageLevel ) Hack for now. +// val drmABigger = drmWrap[Int](drmA.rdd, 3, 2) +// +// +// val ABiggertB = drmABigger.t %*% drmB +// val inCoreABiggertB = ABiggertB.collect +// +// assert(inCoreABiggertB === inCoreABiggertBAnswer) +// +// val bp = 0 +// } +// +// test("Changing row cardinality of an IndexedDataset") { +// +// val a = dense( +// (1.0, 1.0)) +// +// val drmA = drmParallelize(m = a, numPartitions = 2) +// val emptyIDs = new BiDictionary(new HashMap[String, Int]()) +// val indexedDatasetA = new IndexedDatasetSpark(drmA, emptyIDs, emptyIDs) +// val biggerIDSA = indexedDatasetA.newRowCardinality(5) +// +// assert(biggerIDSA.matrix.nrow == 5) +// +// } +// +// test("ItemSimilarityDriver cross similarity two separate items spaces, missing rows in B") { +// /* cross-similarity with category views, same user space +// phones tablets mobile_acc soap +// u1 0 1 1 0 +// u2 1 1 1 0 +//removed ==> u3 0 0 1 0 +// u4 1 1 0 1 +// */ +// val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file +// val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file +// val OutPath = TmpDir + "similarity-matrices/" +// +// val lines = Array( +// "u1,purchase,iphone", +// "u1,purchase,ipad", +// "u2,purchase,nexus", +// "u2,purchase,galaxy", +// "u3,purchase,surface", +// "u4,purchase,iphone", +// "u4,purchase,galaxy", +// "u1,view,phones", +// "u1,view,mobile_acc", +// "u2,view,phones", +// "u2,view,tablets", +// "u2,view,mobile_acc", +// //"u3,view,mobile_acc",// if this line is removed the cross-cooccurrence should work +// "u4,view,phones", +// "u4,view,tablets", +// "u4,view,soap") +// +// val UnequalDimensionsCrossSimilarityLines = tokenize(Iterable( +// "galaxy\ttablets:5.545177444479561 soap:1.7260924347106847 phones:1.7260924347106847", +// "ipad\tmobile_acc:1.7260924347106847 phones:0.6795961471815897", +// "surface", +// "nexus\tmobile_acc:1.7260924347106847 tablets:1.7260924347106847 phones:0.6795961471815897", +// "iphone\tsoap:1.7260924347106847 phones:1.7260924347106847")) +// +// // this will create multiple part-xxxxx files in the InFile dir but other tests will +// // take account of one actual file +// val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) +// val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) +// +// // local multi-threaded Spark with default HDFS +// ItemSimilarityDriver.main(Array( +// "--input", InFile1, +// "--input2", InFile2, +// "--output", OutPath, +// "--master", masterUrl, +// "--filter1", "purchase", +// "--filter2", "view", +// "--inDelim", ",", +// "--itemIDColumn", "2", +// "--rowIDColumn", "0", +// "--filterColumn", "1", +// "--writeAllDatasets")) +// +// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable +// val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable +// tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens +// tokenize(crossSimilarityLines) should contain theSameElementsAs UnequalDimensionsCrossSimilarityLines +// } +// +// test("ItemSimilarityDriver cross similarity two separate items spaces, adding rows in B") { +// /* cross-similarity with category views, same user space +// phones tablets mobile_acc soap +// u1 0 1 1 0 +// u2 1 1 1 0 +//removed ==> u3 0 0 1 0 +// u4 1 1 0 1 +// */ +// val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file +// val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file +// val OutPath = TmpDir + "similarity-matrices/" +// +// val lines = Array( +// "u1,purchase,iphone", +// "u1,purchase,ipad", +// "u2,purchase,nexus", +// "u2,purchase,galaxy", +// "u3,purchase,surface", +// "u4,purchase,iphone", +// "u4,purchase,galaxy", +// "u1,view,phones", +// "u1,view,mobile_acc", +// "u2,view,phones", +// "u2,view,tablets", +// "u2,view,mobile_acc", +// "u3,view,mobile_acc",// if this line is removed the cross-cooccurrence should work +// "u4,view,phones", +// "u4,view,tablets", +// "u4,view,soap", +// "u5,view,soap") +// +// val UnequalDimensionsSimilarityTokens = List( +// "galaxy", +// "nexus:2.231435513142097", +// "iphone:0.13844293808390518", +// "nexus", +// "galaxy:2.231435513142097", +// "ipad", +// "iphone:2.231435513142097", +// "surface", +// "iphone", +// "ipad:2.231435513142097", +// "galaxy:0.13844293808390518") +// +// val UnequalDimensionsCrossSimilarityLines = List( +// "galaxy", +// "tablets:6.730116670092563", +// "phones:2.9110316603236868", +// "soap:0.13844293808390518", +// "mobile_acc:0.13844293808390518", +// "nexus", +// "tablets:2.231435513142097", +// "mobile_acc:1.184939225613002", +// "phones:1.184939225613002", +// "ipad", "mobile_acc:1.184939225613002", +// "phones:1.184939225613002", +// "surface", +// "mobile_acc:1.184939225613002", +// "iphone", +// "phones:2.9110316603236868", +// "soap:0.13844293808390518", +// "tablets:0.13844293808390518", +// "mobile_acc:0.13844293808390518") +// +// // this will create multiple part-xxxxx files in the InFile dir but other tests will +// // take account of one actual file +// val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) +// val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) +// +// // local multi-threaded Spark with default HDFS +// ItemSimilarityDriver.main(Array( +// "--input", InFile1, +// "--input2", InFile2, +// "--output", OutPath, +// "--master", masterUrl, +// "--filter1", "purchase", +// "--filter2", "view", +// "--inDelim", ",", +// "--itemIDColumn", "2", +// "--rowIDColumn", "0", +// "--filterColumn", "1", +// "--writeAllDatasets")) +// +// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable +// val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable +// tokenize(similarityLines) should contain theSameElementsAs UnequalDimensionsSimilarityTokens +// tokenize(crossSimilarityLines) should contain theSameElementsAs UnequalDimensionsCrossSimilarityLines +// } +// +// // convert into an Iterable of tokens for 'should contain theSameElementsAs Iterable' +// def tokenize(a: Iterable[String]): Iterable[String] = { +// var r: Iterable[String] = Iterable() +// a.foreach { l => +// l.split("\t").foreach { s => +// r = r ++ s.split("[\t ]") +// } +// } +// r +// } +// +// override protected def beforeAll(configMap: ConfigMap) { +// super.beforeAll(configMap) +// ItemSimilarityDriver.useContext(mahoutCtx) +// } +// +//}
http://git-wip-us.apache.org/repos/asf/mahout/blob/034790cc/spark/src/test/scala/org/apache/mahout/drivers/RowSimilarityDriverSuite.scala ---------------------------------------------------------------------- diff --git a/spark/src/test/scala/org/apache/mahout/drivers/RowSimilarityDriverSuite.scala b/spark/src/test/scala/org/apache/mahout/drivers/RowSimilarityDriverSuite.scala index eccddb1..e6f917c 100644 --- a/spark/src/test/scala/org/apache/mahout/drivers/RowSimilarityDriverSuite.scala +++ b/spark/src/test/scala/org/apache/mahout/drivers/RowSimilarityDriverSuite.scala @@ -1,139 +1,139 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.drivers - -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.mahout.math.drm.RLikeDrmOps._ -import org.apache.mahout.math.drm._ -import org.apache.mahout.math.scalabindings.RLikeOps._ -import org.apache.mahout.math.scalabindings._ -import org.apache.mahout.sparkbindings._ -import org.apache.mahout.sparkbindings.test.DistributedSparkSuite -import org.scalatest.{ConfigMap, FunSuite} - - -class RowSimilarityDriverSuite extends FunSuite with DistributedSparkSuite { - - val TextDocs = Array( - "doc1\tNow is the time for all good people to come to aid of their party", - "doc2\tNow is the time for all good people to come to aid of their country", - "doc3\tNow is the time for all good people to come to aid of their hood", - "doc4\tNow is the time for all good people to come to aid of their friends", - "doc5\tNow is the time for all good people to come to aid of their looser brother", - "doc6\tThe quick brown fox jumped over the lazy dog", - "doc7\tThe quick brown fox jumped over the lazy boy", - "doc8\tThe quick brown fox jumped over the lazy cat", - "doc9\tThe quick brown fox jumped over the lazy wolverine", - "doc10\tThe quick brown fox jumped over the lazy cantelope")// yes that's spelled correctly. - - test("RowSimilarityDriver text docs no strengths") { - - val firstFiveSimDocsTokens = tokenize(Iterable( - "doc1\tdoc3 doc2 doc4 doc5")) - - val lastFiveSimDocsTokens = tokenize(Iterable( - "doc6\tdoc8 doc10 doc7 doc9")) - - val inDir = TmpDir + "in-dir/" - val inFilename = "in-file.tsv" - val inPath = inDir + inFilename - - val outPath = TmpDir + "similarity-matrices/" - - - // this creates one part-0000 file in the directory - mahoutCtx.parallelize(TextDocs).coalesce(1, shuffle=true).saveAsTextFile(inDir) - - // to change from using part files to a single .tsv file we'll need to use HDFS - val fs = FileSystem.get(new Configuration()) - //rename part-00000 to something.tsv - fs.rename(new Path(inDir + "part-00000"), new Path(inPath)) - - // local multi-threaded Spark with default HDFS - RowSimilarityDriver.main(Array( - "--input", inPath, - "--output", outPath, - "--omitStrength", - "--maxSimilaritiesPerRow", "4", // would get all docs similar if we didn't limit them - "--master", masterUrl)) - - val simLines = mahoutCtx.textFile(outPath).collect - simLines.foreach { line => - val lineTokens = line.split("[\t ]") - if (lineTokens.contains("doc1") ) // docs are two flavors so if only 4 similarities it will effectively classify - lineTokens should contain theSameElementsAs firstFiveSimDocsTokens - else - lineTokens should contain theSameElementsAs lastFiveSimDocsTokens - } - - } - - test("RowSimilarityDriver text docs") { - - val simDocsTokens = tokenize(Iterable( - "doc1\tdoc3:27.87301122947484 doc2:27.87301122947484 doc4:27.87301122947484 doc5:23.42278065550721", - "doc2\tdoc4:27.87301122947484 doc3:27.87301122947484 doc1:27.87301122947484 doc5:23.42278065550721", - "doc3\tdoc4:27.87301122947484 doc2:27.87301122947484 doc1:27.87301122947484 doc5:23.42278065550721", - "doc4\tdoc3:27.87301122947484 doc2:27.87301122947484 doc1:27.87301122947484 doc5:23.42278065550721", - "doc5\tdoc4:23.42278065550721 doc2:23.42278065550721 doc3:23.42278065550721 doc1:23.42278065550721", - "doc6\tdoc8:22.936393049704463 doc10:22.936393049704463 doc7:22.936393049704463 doc9:22.936393049704463", - "doc7\tdoc6:22.936393049704463 doc8:22.936393049704463 doc10:22.936393049704463 doc9:22.936393049704463", - "doc8\tdoc6:22.936393049704463 doc10:22.936393049704463 doc7:22.936393049704463 doc9:22.936393049704463", - "doc9\tdoc6:22.936393049704463 doc8:22.936393049704463 doc10:22.936393049704463 doc7:22.936393049704463", - "doc10\tdoc6:22.936393049704463 doc8:22.936393049704463 doc7:22.936393049704463 doc9:22.936393049704463")) - - val inDir = TmpDir + "in-dir/" - val inFilename = "in-file.tsv" - val inPath = inDir + inFilename - - val outPath = TmpDir + "similarity-matrix/" - - - // this creates one part-0000 file in the directory - mahoutCtx.parallelize(TextDocs).coalesce(1, shuffle=true).saveAsTextFile(inDir) - - // to change from using part files to a single .tsv file we'll need to use HDFS - val fs = FileSystem.get(new Configuration()) - //rename part-00000 to something.tsv - fs.rename(new Path(inDir + "part-00000"), new Path(inPath)) - - // local multi-threaded Spark with default HDFS - RowSimilarityDriver.main(Array( - "--input", inPath, - "--output", outPath, - "--maxSimilaritiesPerRow", "4", // would get all docs similar if we didn't limit them - "--master", masterUrl)) - - val simLines = mahoutCtx.textFile(outPath).collect - tokenize(simLines) should contain theSameElementsAs simDocsTokens - } - - // convert into an Iterable of tokens for 'should contain theSameElementsAs Iterable' - def tokenize(a: Iterable[String], splitString: String = "[\t ]"): Iterable[String] = { - var r: Iterable[String] = Iterable() - a.foreach ( l => r = r ++ l.split(splitString) ) - r - } - - override protected def beforeAll(configMap: ConfigMap) { - super.beforeAll(configMap) - RowSimilarityDriver.useContext(mahoutCtx) - } - -} +///* +// * Licensed to the Apache Software Foundation (ASF) under one or more +// * contributor license agreements. See the NOTICE file distributed with +// * this work for additional information regarding copyright ownership. +// * The ASF licenses this file to You under the Apache License, Version 2.0 +// * (the "License"); you may not use this file except in compliance with +// * the License. You may obtain a copy of the License at +// * +// * http://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// * See the License for the specific language governing permissions and +// * limitations under the License. +// */ +// +//package org.apache.mahout.drivers +// +//import org.apache.hadoop.conf.Configuration +//import org.apache.hadoop.fs.{FileSystem, Path} +//import org.apache.mahout.math.drm.RLikeDrmOps._ +//import org.apache.mahout.math.drm._ +//import org.apache.mahout.math.scalabindings.RLikeOps._ +//import org.apache.mahout.math.scalabindings._ +//import org.apache.mahout.sparkbindings._ +//import org.apache.mahout.sparkbindings.test.DistributedSparkSuite +//import org.scalatest.{ConfigMap, FunSuite} +// +// +//class RowSimilarityDriverSuite extends FunSuite with DistributedSparkSuite { +// +// val TextDocs = Array( +// "doc1\tNow is the time for all good people to come to aid of their party", +// "doc2\tNow is the time for all good people to come to aid of their country", +// "doc3\tNow is the time for all good people to come to aid of their hood", +// "doc4\tNow is the time for all good people to come to aid of their friends", +// "doc5\tNow is the time for all good people to come to aid of their looser brother", +// "doc6\tThe quick brown fox jumped over the lazy dog", +// "doc7\tThe quick brown fox jumped over the lazy boy", +// "doc8\tThe quick brown fox jumped over the lazy cat", +// "doc9\tThe quick brown fox jumped over the lazy wolverine", +// "doc10\tThe quick brown fox jumped over the lazy cantelope")// yes that's spelled correctly. +// +// test("RowSimilarityDriver text docs no strengths") { +// +// val firstFiveSimDocsTokens = tokenize(Iterable( +// "doc1\tdoc3 doc2 doc4 doc5")) +// +// val lastFiveSimDocsTokens = tokenize(Iterable( +// "doc6\tdoc8 doc10 doc7 doc9")) +// +// val inDir = TmpDir + "in-dir/" +// val inFilename = "in-file.tsv" +// val inPath = inDir + inFilename +// +// val outPath = TmpDir + "similarity-matrices/" +// +// +// // this creates one part-0000 file in the directory +// mahoutCtx.parallelize(TextDocs).coalesce(1, shuffle=true).saveAsTextFile(inDir) +// +// // to change from using part files to a single .tsv file we'll need to use HDFS +// val fs = FileSystem.get(new Configuration()) +// //rename part-00000 to something.tsv +// fs.rename(new Path(inDir + "part-00000"), new Path(inPath)) +// +// // local multi-threaded Spark with default HDFS +// RowSimilarityDriver.main(Array( +// "--input", inPath, +// "--output", outPath, +// "--omitStrength", +// "--maxSimilaritiesPerRow", "4", // would get all docs similar if we didn't limit them +// "--master", masterUrl)) +// +// val simLines = mahoutCtx.textFile(outPath).collect +// simLines.foreach { line => +// val lineTokens = line.split("[\t ]") +// if (lineTokens.contains("doc1") ) // docs are two flavors so if only 4 similarities it will effectively classify +// lineTokens should contain theSameElementsAs firstFiveSimDocsTokens +// else +// lineTokens should contain theSameElementsAs lastFiveSimDocsTokens +// } +// +// } +// +// test("RowSimilarityDriver text docs") { +// +// val simDocsTokens = tokenize(Iterable( +// "doc1\tdoc3:27.87301122947484 doc2:27.87301122947484 doc4:27.87301122947484 doc5:23.42278065550721", +// "doc2\tdoc4:27.87301122947484 doc3:27.87301122947484 doc1:27.87301122947484 doc5:23.42278065550721", +// "doc3\tdoc4:27.87301122947484 doc2:27.87301122947484 doc1:27.87301122947484 doc5:23.42278065550721", +// "doc4\tdoc3:27.87301122947484 doc2:27.87301122947484 doc1:27.87301122947484 doc5:23.42278065550721", +// "doc5\tdoc4:23.42278065550721 doc2:23.42278065550721 doc3:23.42278065550721 doc1:23.42278065550721", +// "doc6\tdoc8:22.936393049704463 doc10:22.936393049704463 doc7:22.936393049704463 doc9:22.936393049704463", +// "doc7\tdoc6:22.936393049704463 doc8:22.936393049704463 doc10:22.936393049704463 doc9:22.936393049704463", +// "doc8\tdoc6:22.936393049704463 doc10:22.936393049704463 doc7:22.936393049704463 doc9:22.936393049704463", +// "doc9\tdoc6:22.936393049704463 doc8:22.936393049704463 doc10:22.936393049704463 doc7:22.936393049704463", +// "doc10\tdoc6:22.936393049704463 doc8:22.936393049704463 doc7:22.936393049704463 doc9:22.936393049704463")) +// +// val inDir = TmpDir + "in-dir/" +// val inFilename = "in-file.tsv" +// val inPath = inDir + inFilename +// +// val outPath = TmpDir + "similarity-matrix/" +// +// +// // this creates one part-0000 file in the directory +// mahoutCtx.parallelize(TextDocs).coalesce(1, shuffle=true).saveAsTextFile(inDir) +// +// // to change from using part files to a single .tsv file we'll need to use HDFS +// val fs = FileSystem.get(new Configuration()) +// //rename part-00000 to something.tsv +// fs.rename(new Path(inDir + "part-00000"), new Path(inPath)) +// +// // local multi-threaded Spark with default HDFS +// RowSimilarityDriver.main(Array( +// "--input", inPath, +// "--output", outPath, +// "--maxSimilaritiesPerRow", "4", // would get all docs similar if we didn't limit them +// "--master", masterUrl)) +// +// val simLines = mahoutCtx.textFile(outPath).collect +// tokenize(simLines) should contain theSameElementsAs simDocsTokens +// } +// +// // convert into an Iterable of tokens for 'should contain theSameElementsAs Iterable' +// def tokenize(a: Iterable[String], splitString: String = "[\t ]"): Iterable[String] = { +// var r: Iterable[String] = Iterable() +// a.foreach ( l => r = r ++ l.split(splitString) ) +// r +// } +// +// override protected def beforeAll(configMap: ConfigMap) { +// super.beforeAll(configMap) +// RowSimilarityDriver.useContext(mahoutCtx) +// } +// +//} http://git-wip-us.apache.org/repos/asf/mahout/blob/034790cc/spark/src/test/scala/org/apache/mahout/drivers/TextDelimitedReaderWriterSuite.scala ---------------------------------------------------------------------- diff --git a/spark/src/test/scala/org/apache/mahout/drivers/TextDelimitedReaderWriterSuite.scala b/spark/src/test/scala/org/apache/mahout/drivers/TextDelimitedReaderWriterSuite.scala index 5d92cca..8e56f1e 100644 --- a/spark/src/test/scala/org/apache/mahout/drivers/TextDelimitedReaderWriterSuite.scala +++ b/spark/src/test/scala/org/apache/mahout/drivers/TextDelimitedReaderWriterSuite.scala @@ -1,53 +1,53 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.drivers - -import org.apache.mahout.math.indexeddataset.DefaultIndexedDatasetReadSchema -import org.apache.mahout.sparkbindings._ -import org.apache.mahout.sparkbindings.test.DistributedSparkSuite -import org.scalatest.FunSuite - -import scala.collection.JavaConversions._ - -class TextDelimitedReaderWriterSuite extends FunSuite with DistributedSparkSuite { - test("indexedDatasetDFSRead should read sparse matrix file with null rows") { - val OutFile = TmpDir + "similarity-matrices/part-00000" - - val lines = Array( - "galaxy\tnexus:1.0", - "ipad\tiphone:2.0", - "nexus\tgalaxy:3.0", - "iphone\tipad:4.0", - "surface" - ) - val linesRdd = mahoutCtx.parallelize(lines).saveAsTextFile(OutFile) - - val data = mahoutCtx.indexedDatasetDFSRead(OutFile, DefaultIndexedDatasetReadSchema) - - data.rowIDs.toMap.keySet should equal(Set("galaxy", "ipad", "nexus", "iphone", "surface")) - data.columnIDs.toMap.keySet should equal(Set("nexus", "iphone", "galaxy", "ipad")) - - val a = data.matrix.collect - a.setRowLabelBindings(mapAsJavaMap(data.rowIDs.toMap).asInstanceOf[java.util.Map[java.lang.String, java.lang.Integer]]) - a.setColumnLabelBindings(mapAsJavaMap(data.columnIDs.toMap).asInstanceOf[java.util.Map[java.lang.String, java.lang.Integer]]) - a.get("galaxy", "nexus") should equal(1.0) - a.get("ipad", "iphone") should equal(2.0) - a.get("nexus", "galaxy") should equal(3.0) - a.get("iphone", "ipad") should equal(4.0) - } -} +///* +// * Licensed to the Apache Software Foundation (ASF) under one or more +// * contributor license agreements. See the NOTICE file distributed with +// * this work for additional information regarding copyright ownership. +// * The ASF licenses this file to You under the Apache License, Version 2.0 +// * (the "License"); you may not use this file except in compliance with +// * the License. You may obtain a copy of the License at +// * +// * http://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// * See the License for the specific language governing permissions and +// * limitations under the License. +// */ +// +//package org.apache.mahout.drivers +// +//import org.apache.mahout.math.indexeddataset.DefaultIndexedDatasetReadSchema +//import org.apache.mahout.sparkbindings._ +//import org.apache.mahout.sparkbindings.test.DistributedSparkSuite +//import org.scalatest.FunSuite +// +//import scala.collection.JavaConversions._ +// +//class TextDelimitedReaderWriterSuite extends FunSuite with DistributedSparkSuite { +// test("indexedDatasetDFSRead should read sparse matrix file with null rows") { +// val OutFile = TmpDir + "similarity-matrices/part-00000" +// +// val lines = Array( +// "galaxy\tnexus:1.0", +// "ipad\tiphone:2.0", +// "nexus\tgalaxy:3.0", +// "iphone\tipad:4.0", +// "surface" +// ) +// val linesRdd = mahoutCtx.parallelize(lines).saveAsTextFile(OutFile) +// +// val data = mahoutCtx.indexedDatasetDFSRead(OutFile, DefaultIndexedDatasetReadSchema) +// +// data.rowIDs.toMap.keySet should equal(Set("galaxy", "ipad", "nexus", "iphone", "surface")) +// data.columnIDs.toMap.keySet should equal(Set("nexus", "iphone", "galaxy", "ipad")) +// +// val a = data.matrix.collect +// a.setRowLabelBindings(mapAsJavaMap(data.rowIDs.toMap).asInstanceOf[java.util.Map[java.lang.String, java.lang.Integer]]) +// a.setColumnLabelBindings(mapAsJavaMap(data.columnIDs.toMap).asInstanceOf[java.util.Map[java.lang.String, java.lang.Integer]]) +// a.get("galaxy", "nexus") should equal(1.0) +// a.get("ipad", "iphone") should equal(2.0) +// a.get("nexus", "galaxy") should equal(3.0) +// a.get("iphone", "ipad") should equal(4.0) +// } +//} http://git-wip-us.apache.org/repos/asf/mahout/blob/034790cc/spark/src/test/scala/org/apache/mahout/sparkbindings/SparkBindingsSuite.scala ---------------------------------------------------------------------- diff --git a/spark/src/test/scala/org/apache/mahout/sparkbindings/SparkBindingsSuite.scala b/spark/src/test/scala/org/apache/mahout/sparkbindings/SparkBindingsSuite.scala index 61244a1..dece685 100644 --- a/spark/src/test/scala/org/apache/mahout/sparkbindings/SparkBindingsSuite.scala +++ b/spark/src/test/scala/org/apache/mahout/sparkbindings/SparkBindingsSuite.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.mahout.sparkbindings import java.io.{Closeable, File} @@ -8,9 +25,7 @@ import org.scalatest.FunSuite import scala.collection._ -/** - * @author dmitriy - */ + class SparkBindingsSuite extends FunSuite with DistributedSparkSuite { // This test will succeed only when MAHOUT_HOME is set in the environment. So we keep it for @@ -26,7 +41,8 @@ class SparkBindingsSuite extends FunSuite with DistributedSparkSuite { } mahoutJars.size should be > 0 - mahoutJars.size shouldBe 4 + // this will depend on the viennacl profile. + // mahoutJars.size shouldBe 4 } finally { IOUtilsScala.close(closeables) } http://git-wip-us.apache.org/repos/asf/mahout/blob/034790cc/spark/src/test/scala/org/apache/mahout/sparkbindings/test/DistributedSparkSuite.scala ---------------------------------------------------------------------- diff --git a/spark/src/test/scala/org/apache/mahout/sparkbindings/test/DistributedSparkSuite.scala b/spark/src/test/scala/org/apache/mahout/sparkbindings/test/DistributedSparkSuite.scala index 4c75e75..48d84f8 100644 --- a/spark/src/test/scala/org/apache/mahout/sparkbindings/test/DistributedSparkSuite.scala +++ b/spark/src/test/scala/org/apache/mahout/sparkbindings/test/DistributedSparkSuite.scala @@ -33,7 +33,7 @@ trait DistributedSparkSuite extends DistributedMahoutSuite with LoggerConfigurat protected var masterUrl = null.asInstanceOf[String] protected def initContext() { - masterUrl = System.getProperties.getOrElse("test.spark.master", "local[3]") + masterUrl = System.getProperties.getOrElse("test.spark.master", "local[1]") val isLocal = masterUrl.startsWith("local") mahoutCtx = mahoutSparkContext(masterUrl = this.masterUrl, appName = "MahoutUnitTests",
