Repository: mahout Updated Branches: refs/heads/master a2cbd1570 -> 67f3a65cf
MAHOUT-1912: CLI driver tests not working with ViennaCL. closes apache/mahout#283 Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/67f3a65c Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/67f3a65c Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/67f3a65c Branch: refs/heads/master Commit: 67f3a65cf6b68a2c6ac34130ab16d1a118006f45 Parents: a2cbd15 Author: Andrew Palumbo <[email protected]> Authored: Sat Feb 25 15:31:21 2017 -0800 Committer: Andrew Palumbo <[email protected]> Committed: Sat Feb 25 16:11:40 2017 -0800 ---------------------------------------------------------------------- .../drivers/ItemSimilarityDriverSuite.scala | 1664 +++++++++--------- .../drivers/RowSimilarityDriverSuite.scala | 278 +-- .../TextDelimitedReaderWriterSuite.scala | 106 +- 3 files changed, 1024 insertions(+), 1024 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/67f3a65c/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala ---------------------------------------------------------------------- diff --git a/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala b/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala index fc84577..628d981 100644 --- a/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala +++ b/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala @@ -1,832 +1,832 @@ -///* -// * Licensed to the Apache Software Foundation (ASF) under one or more -// * contributor license agreements. See the NOTICE file distributed with -// * this work for additional information regarding copyright ownership. -// * The ASF licenses this file to You under the Apache License, Version 2.0 -// * (the "License"); you may not use this file except in compliance with -// * the License. 
You may obtain a copy of the License at -// * -// * http://www.apache.org/licenses/LICENSE-2.0 -// * -// * Unless required by applicable law or agreed to in writing, software -// * distributed under the License is distributed on an "AS IS" BASIS, -// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// * See the License for the specific language governing permissions and -// * limitations under the License. -// */ -// -//package org.apache.mahout.drivers -// -//import org.apache.hadoop.conf.Configuration -//import org.apache.hadoop.fs.{Path, FileSystem} -//import org.apache.mahout.math.indexeddataset.{BiDictionary, IndexedDataset} -//import org.apache.mahout.sparkbindings.indexeddataset.IndexedDatasetSpark -//import org.scalatest.{ConfigMap, FunSuite} -//import org.apache.mahout.sparkbindings._ -//import org.apache.mahout.sparkbindings.test.DistributedSparkSuite -//import org.apache.mahout.math.drm._ -//import org.apache.mahout.math.scalabindings._ -// -//import scala.collection.immutable.HashMap -// -////todo: take out, only for temp tests -// -//import org.apache.mahout.math.scalabindings._ -//import RLikeOps._ -//import org.apache.mahout.math.drm._ -//import RLikeDrmOps._ -//import scala.collection.JavaConversions._ -// -// -//class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite { -// -// /* -// final val matrixLLRCoocAtAControl = dense( -// (0.0, 0.6331745808516107, 0.0, 0.0, 0.0), -// (0.6331745808516107, 0.0, 0.0, 0.0, 0.0), -// (0.0, 0.0, 0.0, 0.6331745808516107, 0.0), -// (0.0, 0.0, 0.6331745808516107, 0.0, 0.0), -// (0.0, 0.0, 0.0, 0.0, 0.0)) -// -// final val matrixLLRCoocBtAControl = dense( -// (1.7260924347106847, 1.7260924347106847, 1.7260924347106847, 1.7260924347106847, 0.0), -// (0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 0.0), -// (0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 0.0), -// (1.7260924347106847, 1.7260924347106847, 
1.7260924347106847, 1.7260924347106847, 0.0), -// (0.0, 0.0, 0.6795961471815897, 0.0, 4.498681156950466)) -// */ -// -// -// final val SelfSimilairtyLines = Iterable( -// "galaxy\tnexus:1.7260924347106847", -// "ipad\tiphone:1.7260924347106847", -// "nexus\tgalaxy:1.7260924347106847", -// "iphone\tipad:1.7260924347106847", -// "surface") -// -// val CrossSimilarityLines = Iterable( -// "iphone\tnexus:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 galaxy:1.7260924347106847", -// "ipad\tnexus:0.6795961471815897 iphone:0.6795961471815897 ipad:0.6795961471815897 galaxy:0.6795961471815897", -// "nexus\tnexus:0.6795961471815897 iphone:0.6795961471815897 ipad:0.6795961471815897 galaxy:0.6795961471815897", -// "galaxy\tnexus:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 galaxy:1.7260924347106847", -// "surface\tsurface:4.498681156950466 nexus:0.6795961471815897") -// -// // todo: a better test would be to sort each vector by itemID and compare rows, tokens misses some error cases -// final val SelfSimilairtyTokens = tokenize(Iterable( -// "galaxy\tnexus:1.7260924347106847", -// "ipad\tiphone:1.7260924347106847", -// "nexus\tgalaxy:1.7260924347106847", -// "iphone\tipad:1.7260924347106847", -// "surface")) -// -// val CrossSimilarityTokens = tokenize(Iterable( -// "iphone\tnexus:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 galaxy:1.7260924347106847", -// "ipad\tnexus:0.6795961471815897 iphone:0.6795961471815897 ipad:0.6795961471815897 galaxy:0.6795961471815897", -// "nexus\tnexus:0.6795961471815897 iphone:0.6795961471815897 ipad:0.6795961471815897 galaxy:0.6795961471815897", -// "galaxy\tnexus:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 galaxy:1.7260924347106847", -// "surface\tsurface:4.498681156950466 nexus:0.6795961471815897")) -// -// /* -// //Clustered Spark and HDFS, not a good everyday build test -// ItemSimilarityDriver.main(Array( -// "--input", 
"hdfs://occam4:54310/user/pat/spark-itemsimilarity/cf-data.txt", -// "--output", "hdfs://occam4:54310/user/pat/spark-itemsimilarity/similarityMatrices/", -// "--master", "spark://occam4:7077", -// "--filter1", "purchase", -// "--filter2", "view", -// "--inDelim", ",", -// "--itemIDColumn", "2", -// "--rowIDColumn", "0", -// "--filterColumn", "1")) -// */ -// // local multi-threaded Spark with HDFS using large dataset -// // not a good build test. -// /* -// ItemSimilarityDriver.main(Array( -// "--input", "hdfs://occam4:54310/user/pat/xrsj/ratings_data.txt", -// "--output", "hdfs://occam4:54310/user/pat/xrsj/similarityMatrices/", -// "--master", "local[4]", -// "--filter1", "purchase", -// "--filter2", "view", -// "--inDelim", ",", -// "--itemIDColumn", "2", -// "--rowIDColumn", "0", -// "--filterColumn", "1")) -// */ -// -// test("ItemSimilarityDriver, non-full-spec CSV") { -// -// val InFile = TmpDir + "in-file.csv/" //using part files, not single file -// val OutPath = TmpDir + "similarity-matrices/" -// -// val lines = Array( -// "u1,purchase,iphone", -// "u1,purchase,ipad", -// "u2,purchase,nexus", -// "u2,purchase,galaxy", -// "u3,purchase,surface", -// "u4,purchase,iphone", -// "u4,purchase,galaxy", -// "u1,view,iphone", -// "u1,view,ipad", -// "u1,view,nexus", -// "u1,view,galaxy", -// "u2,view,iphone", -// "u2,view,ipad", -// "u2,view,nexus", -// "u2,view,galaxy", -// "u3,view,surface", -// "u3,view,nexus", -// "u4,view,iphone", -// "u4,view,ipad", -// "u4,view,galaxy") -// -// // this will create multiple part-xxxxx files in the InFile dir but other tests will -// // take account of one actual file -// val linesRdd = mahoutCtx.parallelize(lines).saveAsTextFile(InFile) -// -// // local multi-threaded Spark with default HDFS -// ItemSimilarityDriver.main(Array( -// "--input", InFile, -// "--output", OutPath, -// "--master", masterUrl, -// "--filter1", "purchase", -// "--filter2", "view", -// "--inDelim", ",", -// "--itemIDColumn", "2", -// "--rowIDColumn", 
"0", -// "--filterColumn", "1", -// "--writeAllDatasets")) -// -// // todo: these comparisons rely on a sort producing the same lines, which could possibly -// // fail since the sort is on value and these can be the same for all items in a vector -// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable -// tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens -// val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable -// tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens -// } -// -// -// -// test("ItemSimilarityDriver TSV ") { -// -// val InFile = TmpDir + "in-file.tsv/" -// val OutPath = TmpDir + "similarity-matrices/" -// -// val lines = Array( -// "u1\tpurchase\tiphone", -// "u1\tpurchase\tipad", -// "u2\tpurchase\tnexus", -// "u2\tpurchase\tgalaxy", -// "u3\tpurchase\tsurface", -// "u4\tpurchase\tiphone", -// "u4\tpurchase\tgalaxy", -// "u1\tview\tiphone", -// "u1\tview\tipad", -// "u1\tview\tnexus", -// "u1\tview\tgalaxy", -// "u2\tview\tiphone", -// "u2\tview\tipad", -// "u2\tview\tnexus", -// "u2\tview\tgalaxy", -// "u3\tview\tsurface", -// "u3\tview\tnexus", -// "u4\tview\tiphone", -// "u4\tview\tipad", -// "u4\tview\tgalaxy") -// -// // this will create multiple part-xxxxx files in the InFile dir but other tests will -// // take account of one actual file -// val linesRdd = mahoutCtx.parallelize(lines).saveAsTextFile(InFile) -// -// // local multi-threaded Spark with default HDFS -// ItemSimilarityDriver.main(Array( -// "--input", InFile, -// "--output", OutPath, -// "--master", masterUrl, -// "--filter1", "purchase", -// "--filter2", "view", -// "--inDelim", "[,\t]", -// "--itemIDColumn", "2", -// "--rowIDColumn", "0", -// "--filterColumn", "1")) -// -// // todo: a better test would be to get sorted vectors and compare rows instead of tokens, this might miss -// // some error cases -// val similarityLines = 
mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable -// tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens -// val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable -// tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens -// -// } -// -// test("ItemSimilarityDriver log-ish files") { -// -// val InFile = TmpDir + "in-file.log/" -// val OutPath = TmpDir + "similarity-matrices/" -// -// val lines = Array( -// "2014-06-23 14:46:53.115\tu1\tpurchase\trandom text\tiphone", -// "2014-06-23 14:46:53.115\tu1\tpurchase\trandom text\tipad", -// "2014-06-23 14:46:53.115\tu2\tpurchase\trandom text\tnexus", -// "2014-06-23 14:46:53.115\tu2\tpurchase\trandom text\tgalaxy", -// "2014-06-23 14:46:53.115\tu3\tpurchase\trandom text\tsurface", -// "2014-06-23 14:46:53.115\tu4\tpurchase\trandom text\tiphone", -// "2014-06-23 14:46:53.115\tu4\tpurchase\trandom text\tgalaxy", -// "2014-06-23 14:46:53.115\tu1\tview\trandom text\tiphone", -// "2014-06-23 14:46:53.115\tu1\tview\trandom text\tipad", -// "2014-06-23 14:46:53.115\tu1\tview\trandom text\tnexus", -// "2014-06-23 14:46:53.115\tu1\tview\trandom text\tgalaxy", -// "2014-06-23 14:46:53.115\tu2\tview\trandom text\tiphone", -// "2014-06-23 14:46:53.115\tu2\tview\trandom text\tipad", -// "2014-06-23 14:46:53.115\tu2\tview\trandom text\tnexus", -// "2014-06-23 14:46:53.115\tu2\tview\trandom text\tgalaxy", -// "2014-06-23 14:46:53.115\tu3\tview\trandom text\tsurface", -// "2014-06-23 14:46:53.115\tu3\tview\trandom text\tnexus", -// "2014-06-23 14:46:53.115\tu4\tview\trandom text\tiphone", -// "2014-06-23 14:46:53.115\tu4\tview\trandom text\tipad", -// "2014-06-23 14:46:53.115\tu4\tview\trandom text\tgalaxy") -// -// // this will create multiple part-xxxxx files in the InFile dir but other tests will -// // take account of one actual file -// val linesRdd = mahoutCtx.parallelize(lines).saveAsTextFile(InFile) 
-// -// // local multi-threaded Spark with default HDFS -// ItemSimilarityDriver.main(Array( -// "--input", InFile, -// "--output", OutPath, -// "--master", masterUrl, -// "--filter1", "purchase", -// "--filter2", "view", -// "--inDelim", "\t", -// "--itemIDColumn", "4", -// "--rowIDColumn", "1", -// "--filterColumn", "2")) -// -// -// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable -// tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens -// val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable -// tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens -// -// } -// -// test("ItemSimilarityDriver legacy supported file format") { -// -// val InDir = TmpDir + "in-dir/" -// val InFilename = "in-file.tsv" -// val InPath = InDir + InFilename -// -// val OutPath = TmpDir + "similarity-matrices" -// -// val lines = Array( -// "0,0,1", -// "0,1,1", -// "1,2,1", -// "1,3,1", -// "2,4,1", -// "3,0,1", -// "3,3,1") -// -// val Answer = tokenize(Iterable( -// "0\t1:1.7260924347106847", -// "3\t2:1.7260924347106847", -// "1\t0:1.7260924347106847", -// "4", -// "2\t3:1.7260924347106847")) -// -// // this creates one part-0000 file in the directory -// mahoutCtx.parallelize(lines).coalesce(1, shuffle = true).saveAsTextFile(InDir) -// -// // to change from using part files to a single .tsv file we'll need to use HDFS -// val fs = FileSystem.get(new Configuration()) -// //rename part-00000 to something.tsv -// fs.rename(new Path(InDir + "part-00000"), new Path(InPath)) -// -// // local multi-threaded Spark with default HDFS -// ItemSimilarityDriver.main(Array( -// "--input", InPath, -// "--output", OutPath, -// "--master", masterUrl)) -// -// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable -// tokenize(similarityLines) should contain theSameElementsAs Answer -// -// } -// -// 
test("ItemSimilarityDriver write search engine output") { -// -// val InDir = TmpDir + "in-dir/" -// val InFilename = "in-file.tsv" -// val InPath = InDir + InFilename -// -// val OutPath = TmpDir + "similarity-matrices" -// -// val lines = Array( -// "0,0,1", -// "0,1,1", -// "1,2,1", -// "1,3,1", -// "2,4,1", -// "3,0,1", -// "3,3,1") -// -// val Answer = tokenize(Iterable( -// "0\t1", -// "3\t2", -// "1\t0", -// "4", -// "2\t3")) -// -// // this creates one part-0000 file in the directory -// mahoutCtx.parallelize(lines).coalesce(1, shuffle = true).saveAsTextFile(InDir) -// -// // to change from using part files to a single .tsv file we'll need to use HDFS -// val fs = FileSystem.get(new Configuration()) -// //rename part-00000 to something.tsv -// fs.rename(new Path(InDir + "part-00000"), new Path(InPath)) -// -// // local multi-threaded Spark with default HDFS -// ItemSimilarityDriver.main(Array( -// "--input", InPath, -// "--output", OutPath, -// "--master", masterUrl, -// "--omitStrength")) -// -// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable -// tokenize(similarityLines) should contain theSameElementsAs Answer -// -// } -// -// test("ItemSimilarityDriver recursive file discovery using filename patterns") { -// //directory structure using the following -// // tmp/data/m1.tsv -// // tmp/data/more-data/another-dir/m2.tsv -// val M1Lines = Array( -// "u1\tpurchase\tiphone", -// "u1\tpurchase\tipad", -// "u2\tpurchase\tnexus", -// "u2\tpurchase\tgalaxy", -// "u3\tpurchase\tsurface", -// "u4\tpurchase\tiphone", -// "u4\tpurchase\tgalaxy") -// -// val M2Lines = Array( -// "u1\tview\tiphone", -// "u1\tview\tipad", -// "u1\tview\tnexus", -// "u1\tview\tgalaxy", -// "u2\tview\tiphone", -// "u2\tview\tipad", -// "u2\tview\tnexus", -// "u2\tview\tgalaxy", -// "u3\tview\tsurface", -// "u3\tview\tnexus", -// "u4\tview\tiphone", -// "u4\tview\tipad", -// "u4\tview\tgalaxy") -// -// val InFilenameM1 = "m1.tsv" -// val InDirM1 
= TmpDir + "data/" -// val InPathM1 = InDirM1 + InFilenameM1 -// val InFilenameM2 = "m2.tsv" -// val InDirM2 = TmpDir + "data/more-data/another-dir/" -// val InPathM2 = InDirM2 + InFilenameM2 -// -// val InPathStart = TmpDir + "data/" -// val OutPath = TmpDir + "similarity-matrices" -// -// // this creates one part-0000 file in the directory -// mahoutCtx.parallelize(M1Lines).coalesce(1, shuffle = true).saveAsTextFile(InDirM1) -// -// // to change from using part files to a single .tsv file we'll need to use HDFS -// val fs = FileSystem.get(new Configuration()) -// //rename part-00000 to something.tsv -// fs.rename(new Path(InDirM1 + "part-00000"), new Path(InPathM1)) -// -// // this creates one part-0000 file in the directory -// mahoutCtx.parallelize(M2Lines).coalesce(1, shuffle = true).saveAsTextFile(InDirM2) -// -// // to change from using part files to a single .tsv file we'll need to use HDFS -// //rename part-00000 to tmp/some-location/something.tsv -// fs.rename(new Path(InDirM2 + "part-00000"), new Path(InPathM2)) -// -// // local multi-threaded Spark with default FS, suitable for build tests but need better location for data -// -// ItemSimilarityDriver.main(Array( -// "--input", InPathStart, -// "--output", OutPath, -// "--master", masterUrl, -// "--filter1", "purchase", -// "--filter2", "view", -// "--inDelim", "\t", -// "--itemIDColumn", "2", -// "--rowIDColumn", "0", -// "--filterColumn", "1", -// "--filenamePattern", "m..tsv", -// "--recursive")) -// -// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable -// tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens -// val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable -// tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens -// -// } -// -// test("ItemSimilarityDriver, two input paths") { -// -// val InFile1 = TmpDir + "in-file1.csv/" //using part 
files, not single file -// val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file -// val OutPath = TmpDir + "similarity-matrices/" -// -// val lines = Array( -// "u1,purchase,iphone", -// "u1,purchase,ipad", -// "u2,purchase,nexus", -// "u2,purchase,galaxy", -// "u3,purchase,surface", -// "u4,purchase,iphone", -// "u4,purchase,galaxy", -// "u1,view,iphone", -// "u1,view,ipad", -// "u1,view,nexus", -// "u1,view,galaxy", -// "u2,view,iphone", -// "u2,view,ipad", -// "u2,view,nexus", -// "u2,view,galaxy", -// "u3,view,surface", -// "u3,view,nexus", -// "u4,view,iphone", -// "u4,view,ipad", -// "u4,view,galaxy") -// -// // this will create multiple part-xxxxx files in the InFile dir but other tests will -// // take account of one actual file -// val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) -// val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) -// -// // local multi-threaded Spark with default HDFS -// ItemSimilarityDriver.main(Array( -// "--input", InFile1, -// "--input2", InFile2, -// "--output", OutPath, -// "--master", masterUrl, -// "--filter1", "purchase", -// "--filter2", "view", -// "--inDelim", ",", -// "--itemIDColumn", "2", -// "--rowIDColumn", "0", -// "--filterColumn", "1")) -// -// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable -// tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens -// val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable -// tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens -// -// } -// -// test("ItemSimilarityDriver, two inputs of different dimensions") { -// -// val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file -// val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file -// val OutPath = TmpDir + "similarity-matrices/" -// -// val lines = Array( -// "u1,purchase,iphone", 
-// "u1,purchase,ipad", -// "u2,purchase,nexus", -// "u2,purchase,galaxy", -// // remove one user so A'B will be of different dimensions -// // ItemSimilarityDriver should create one unified user dictionary and so account for this -// // discrepancy as a blank row: "u3,purchase,surface", -// "u4,purchase,iphone", -// "u4,purchase,galaxy", -// "u1,view,iphone", -// "u1,view,ipad", -// "u1,view,nexus", -// "u1,view,galaxy", -// "u2,view,iphone", -// "u2,view,ipad", -// "u2,view,nexus", -// "u2,view,galaxy", -// "u3,view,surface", -// "u3,view,nexus", -// "u4,view,iphone", -// "u4,view,ipad", -// "u4,view,galaxy") -// -// val UnequalDimensionsSelfSimilarity = tokenize(Iterable( -// "ipad\tiphone:1.7260924347106847", -// "iphone\tipad:1.7260924347106847", -// "nexus\tgalaxy:1.7260924347106847", -// "galaxy\tnexus:1.7260924347106847")) -// -// //only surface purchase was removed so no cross-similarity for surface -// val UnequalDimensionsCrossSimilarity = tokenize(Iterable( -// "galaxy\tgalaxy:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 nexus:1.7260924347106847", -// "iphone\tgalaxy:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 nexus:1.7260924347106847", -// "ipad\tgalaxy:0.6795961471815897 iphone:0.6795961471815897 ipad:0.6795961471815897 nexus:0.6795961471815897", -// "nexus\tiphone:0.6795961471815897 ipad:0.6795961471815897 nexus:0.6795961471815897 galaxy:0.6795961471815897")) -// // this will create multiple part-xxxxx files in the InFile dir but other tests will -// // take account of one actual file -// val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) -// val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) -// -// // local multi-threaded Spark with default HDFS -// ItemSimilarityDriver.main(Array( -// "--input", InFile1, -// "--input2", InFile2, -// "--output", OutPath, -// "--master", masterUrl, -// "--filter1", "purchase", -// "--filter2", "view", -// "--inDelim", ",", -// 
"--itemIDColumn", "2", -// "--rowIDColumn", "0", -// "--filterColumn", "1")) -// -// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable -// val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable -// tokenize(similarityLines) should contain theSameElementsAs UnequalDimensionsSelfSimilarity -// tokenize(crossSimilarityLines) should contain theSameElementsAs UnequalDimensionsCrossSimilarity -// -// } -// -// test("ItemSimilarityDriver cross similarity two separate items spaces") { -// /* cross-similarity with category views, same user space -// phones tablets mobile_acc soap -// u1 0 1 1 0 -// u2 1 1 1 0 -// u3 0 0 1 0 -// u4 1 1 0 1 -// */ -// val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file -// val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file -// val OutPath = TmpDir + "similarity-matrices/" -// -// val lines = Array( -// "u1,purchase,iphone", -// "u1,purchase,ipad", -// "u2,purchase,nexus", -// "u2,purchase,galaxy", -// "u3,purchase,surface", -// "u4,purchase,iphone", -// "u4,purchase,galaxy", -// "u1,view,phones", -// "u1,view,mobile_acc", -// "u2,view,phones", -// "u2,view,tablets", -// "u2,view,mobile_acc", -// "u3,view,mobile_acc", -// "u4,view,phones", -// "u4,view,tablets", -// "u4,view,soap") -// -// val UnequalDimensionsCrossSimilarityLines = tokenize(Iterable( -// "iphone\tmobile_acc:1.7260924347106847 soap:1.7260924347106847 phones:1.7260924347106847", -// "surface\tmobile_acc:0.6795961471815897", -// "nexus\ttablets:1.7260924347106847 mobile_acc:0.6795961471815897 phones:0.6795961471815897", -// "galaxy\ttablets:5.545177444479561 soap:1.7260924347106847 phones:1.7260924347106847 " + -// "mobile_acc:1.7260924347106847", -// "ipad\tmobile_acc:0.6795961471815897 phones:0.6795961471815897")) -// -// // this will create multiple part-xxxxx files in the InFile dir but other tests will -// // take account of one actual 
file -// val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) -// val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) -// -// // local multi-threaded Spark with default HDFS -// ItemSimilarityDriver.main(Array( -// "--input", InFile1, -// "--input2", InFile2, -// "--output", OutPath, -// "--master", masterUrl, -// "--filter1", "purchase", -// "--filter2", "view", -// "--inDelim", ",", -// "--itemIDColumn", "2", -// "--rowIDColumn", "0", -// "--filterColumn", "1", -// "--writeAllDatasets")) -// -// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable -// val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable -// tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens -// tokenize(crossSimilarityLines) should contain theSameElementsAs UnequalDimensionsCrossSimilarityLines -// -// } -// -// test("A.t %*% B after changing row cardinality of A") { -// // todo: move to math tests but this is Spark specific -// -// val a = dense( -// (1.0, 1.0)) -// -// val b = dense( -// (1.0, 1.0), -// (1.0, 1.0), -// (1.0, 1.0)) -// -// val inCoreABiggertBAnswer = dense( -// (1.0, 1.0), -// (1.0, 1.0)) -// -// val drmA = drmParallelize(m = a, numPartitions = 2) -// val drmB = drmParallelize(m = b, numPartitions = 2) -// -// // modified to return a new CheckpointedDrm so maintains immutability but still only increases the row cardinality -// // by returning new CheckpointedDrmSpark[K](rdd, n, ncol, _cacheStorageLevel ) Hack for now. 
-// val drmABigger = drmWrap[Int](drmA.rdd, 3, 2) -// -// -// val ABiggertB = drmABigger.t %*% drmB -// val inCoreABiggertB = ABiggertB.collect -// -// assert(inCoreABiggertB === inCoreABiggertBAnswer) -// -// val bp = 0 -// } -// -// test("Changing row cardinality of an IndexedDataset") { -// -// val a = dense( -// (1.0, 1.0)) -// -// val drmA = drmParallelize(m = a, numPartitions = 2) -// val emptyIDs = new BiDictionary(new HashMap[String, Int]()) -// val indexedDatasetA = new IndexedDatasetSpark(drmA, emptyIDs, emptyIDs) -// val biggerIDSA = indexedDatasetA.newRowCardinality(5) -// -// assert(biggerIDSA.matrix.nrow == 5) -// -// } -// -// test("ItemSimilarityDriver cross similarity two separate items spaces, missing rows in B") { -// /* cross-similarity with category views, same user space -// phones tablets mobile_acc soap -// u1 0 1 1 0 -// u2 1 1 1 0 -//removed ==> u3 0 0 1 0 -// u4 1 1 0 1 -// */ -// val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file -// val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file -// val OutPath = TmpDir + "similarity-matrices/" -// -// val lines = Array( -// "u1,purchase,iphone", -// "u1,purchase,ipad", -// "u2,purchase,nexus", -// "u2,purchase,galaxy", -// "u3,purchase,surface", -// "u4,purchase,iphone", -// "u4,purchase,galaxy", -// "u1,view,phones", -// "u1,view,mobile_acc", -// "u2,view,phones", -// "u2,view,tablets", -// "u2,view,mobile_acc", -// //"u3,view,mobile_acc",// if this line is removed the cross-cooccurrence should work -// "u4,view,phones", -// "u4,view,tablets", -// "u4,view,soap") -// -// val UnequalDimensionsCrossSimilarityLines = tokenize(Iterable( -// "galaxy\ttablets:5.545177444479561 soap:1.7260924347106847 phones:1.7260924347106847", -// "ipad\tmobile_acc:1.7260924347106847 phones:0.6795961471815897", -// "surface", -// "nexus\tmobile_acc:1.7260924347106847 tablets:1.7260924347106847 phones:0.6795961471815897", -// "iphone\tsoap:1.7260924347106847 
phones:1.7260924347106847")) -// -// // this will create multiple part-xxxxx files in the InFile dir but other tests will -// // take account of one actual file -// val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) -// val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) -// -// // local multi-threaded Spark with default HDFS -// ItemSimilarityDriver.main(Array( -// "--input", InFile1, -// "--input2", InFile2, -// "--output", OutPath, -// "--master", masterUrl, -// "--filter1", "purchase", -// "--filter2", "view", -// "--inDelim", ",", -// "--itemIDColumn", "2", -// "--rowIDColumn", "0", -// "--filterColumn", "1", -// "--writeAllDatasets")) -// -// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable -// val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable -// tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens -// tokenize(crossSimilarityLines) should contain theSameElementsAs UnequalDimensionsCrossSimilarityLines -// } -// -// test("ItemSimilarityDriver cross similarity two separate items spaces, adding rows in B") { -// /* cross-similarity with category views, same user space -// phones tablets mobile_acc soap -// u1 0 1 1 0 -// u2 1 1 1 0 -//removed ==> u3 0 0 1 0 -// u4 1 1 0 1 -// */ -// val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file -// val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file -// val OutPath = TmpDir + "similarity-matrices/" -// -// val lines = Array( -// "u1,purchase,iphone", -// "u1,purchase,ipad", -// "u2,purchase,nexus", -// "u2,purchase,galaxy", -// "u3,purchase,surface", -// "u4,purchase,iphone", -// "u4,purchase,galaxy", -// "u1,view,phones", -// "u1,view,mobile_acc", -// "u2,view,phones", -// "u2,view,tablets", -// "u2,view,mobile_acc", -// "u3,view,mobile_acc",// if this line is removed the cross-cooccurrence should work -// 
"u4,view,phones", -// "u4,view,tablets", -// "u4,view,soap", -// "u5,view,soap") -// -// val UnequalDimensionsSimilarityTokens = List( -// "galaxy", -// "nexus:2.231435513142097", -// "iphone:0.13844293808390518", -// "nexus", -// "galaxy:2.231435513142097", -// "ipad", -// "iphone:2.231435513142097", -// "surface", -// "iphone", -// "ipad:2.231435513142097", -// "galaxy:0.13844293808390518") -// -// val UnequalDimensionsCrossSimilarityLines = List( -// "galaxy", -// "tablets:6.730116670092563", -// "phones:2.9110316603236868", -// "soap:0.13844293808390518", -// "mobile_acc:0.13844293808390518", -// "nexus", -// "tablets:2.231435513142097", -// "mobile_acc:1.184939225613002", -// "phones:1.184939225613002", -// "ipad", "mobile_acc:1.184939225613002", -// "phones:1.184939225613002", -// "surface", -// "mobile_acc:1.184939225613002", -// "iphone", -// "phones:2.9110316603236868", -// "soap:0.13844293808390518", -// "tablets:0.13844293808390518", -// "mobile_acc:0.13844293808390518") -// -// // this will create multiple part-xxxxx files in the InFile dir but other tests will -// // take account of one actual file -// val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) -// val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) -// -// // local multi-threaded Spark with default HDFS -// ItemSimilarityDriver.main(Array( -// "--input", InFile1, -// "--input2", InFile2, -// "--output", OutPath, -// "--master", masterUrl, -// "--filter1", "purchase", -// "--filter2", "view", -// "--inDelim", ",", -// "--itemIDColumn", "2", -// "--rowIDColumn", "0", -// "--filterColumn", "1", -// "--writeAllDatasets")) -// -// val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable -// val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable -// tokenize(similarityLines) should contain theSameElementsAs UnequalDimensionsSimilarityTokens -// tokenize(crossSimilarityLines) should 
contain theSameElementsAs UnequalDimensionsCrossSimilarityLines -// } -// -// // convert into an Iterable of tokens for 'should contain theSameElementsAs Iterable' -// def tokenize(a: Iterable[String]): Iterable[String] = { -// var r: Iterable[String] = Iterable() -// a.foreach { l => -// l.split("\t").foreach { s => -// r = r ++ s.split("[\t ]") -// } -// } -// r -// } -// -// override protected def beforeAll(configMap: ConfigMap) { -// super.beforeAll(configMap) -// ItemSimilarityDriver.useContext(mahoutCtx) -// } -// -//} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.drivers + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{Path, FileSystem} +import org.apache.mahout.math.indexeddataset.{BiDictionary, IndexedDataset} +import org.apache.mahout.sparkbindings.indexeddataset.IndexedDatasetSpark +import org.scalatest.{ConfigMap, FunSuite} +import org.apache.mahout.sparkbindings._ +import org.apache.mahout.sparkbindings.test.DistributedSparkSuite +import org.apache.mahout.math.drm._ +import org.apache.mahout.math.scalabindings._ + +import scala.collection.immutable.HashMap + +//todo: take out, only for temp tests + +import org.apache.mahout.math.scalabindings._ +import RLikeOps._ +import org.apache.mahout.math.drm._ +import RLikeDrmOps._ +import scala.collection.JavaConversions._ + + +class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite { + + /* + final val matrixLLRCoocAtAControl = dense( + (0.0, 0.6331745808516107, 0.0, 0.0, 0.0), + (0.6331745808516107, 0.0, 0.0, 0.0, 0.0), + (0.0, 0.0, 0.0, 0.6331745808516107, 0.0), + (0.0, 0.0, 0.6331745808516107, 0.0, 0.0), + (0.0, 0.0, 0.0, 0.0, 0.0)) + + final val matrixLLRCoocBtAControl = dense( + (1.7260924347106847, 1.7260924347106847, 1.7260924347106847, 1.7260924347106847, 0.0), + (0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 0.0), + (0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 0.6795961471815897, 0.0), + (1.7260924347106847, 1.7260924347106847, 1.7260924347106847, 1.7260924347106847, 0.0), + (0.0, 0.0, 0.6795961471815897, 0.0, 4.498681156950466)) + */ + + + final val SelfSimilairtyLines = Iterable( + "galaxy\tnexus:1.7260924347106847", + "ipad\tiphone:1.7260924347106847", + "nexus\tgalaxy:1.7260924347106847", + "iphone\tipad:1.7260924347106847", + "surface") + + val CrossSimilarityLines = Iterable( + "iphone\tnexus:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 galaxy:1.7260924347106847", + "ipad\tnexus:0.6795961471815897 
iphone:0.6795961471815897 ipad:0.6795961471815897 galaxy:0.6795961471815897", + "nexus\tnexus:0.6795961471815897 iphone:0.6795961471815897 ipad:0.6795961471815897 galaxy:0.6795961471815897", + "galaxy\tnexus:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 galaxy:1.7260924347106847", + "surface\tsurface:4.498681156950466 nexus:0.6795961471815897") + + // todo: a better test would be to sort each vector by itemID and compare rows, tokens misses some error cases + final val SelfSimilairtyTokens = tokenize(Iterable( + "galaxy\tnexus:1.7260924347106847", + "ipad\tiphone:1.7260924347106847", + "nexus\tgalaxy:1.7260924347106847", + "iphone\tipad:1.7260924347106847", + "surface")) + + val CrossSimilarityTokens = tokenize(Iterable( + "iphone\tnexus:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 galaxy:1.7260924347106847", + "ipad\tnexus:0.6795961471815897 iphone:0.6795961471815897 ipad:0.6795961471815897 galaxy:0.6795961471815897", + "nexus\tnexus:0.6795961471815897 iphone:0.6795961471815897 ipad:0.6795961471815897 galaxy:0.6795961471815897", + "galaxy\tnexus:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 galaxy:1.7260924347106847", + "surface\tsurface:4.498681156950466 nexus:0.6795961471815897")) + + /* + //Clustered Spark and HDFS, not a good everyday build test + ItemSimilarityDriver.main(Array( + "--input", "hdfs://occam4:54310/user/pat/spark-itemsimilarity/cf-data.txt", + "--output", "hdfs://occam4:54310/user/pat/spark-itemsimilarity/similarityMatrices/", + "--master", "spark://occam4:7077", + "--filter1", "purchase", + "--filter2", "view", + "--inDelim", ",", + "--itemIDColumn", "2", + "--rowIDColumn", "0", + "--filterColumn", "1")) + */ + // local multi-threaded Spark with HDFS using large dataset + // not a good build test. 
+ /* + ItemSimilarityDriver.main(Array( + "--input", "hdfs://occam4:54310/user/pat/xrsj/ratings_data.txt", + "--output", "hdfs://occam4:54310/user/pat/xrsj/similarityMatrices/", + "--master", "local[4]", + "--filter1", "purchase", + "--filter2", "view", + "--inDelim", ",", + "--itemIDColumn", "2", + "--rowIDColumn", "0", + "--filterColumn", "1")) + */ + + test("ItemSimilarityDriver, non-full-spec CSV") { + + val InFile = TmpDir + "in-file.csv/" //using part files, not single file + val OutPath = TmpDir + "similarity-matrices/" + + val lines = Array( + "u1,purchase,iphone", + "u1,purchase,ipad", + "u2,purchase,nexus", + "u2,purchase,galaxy", + "u3,purchase,surface", + "u4,purchase,iphone", + "u4,purchase,galaxy", + "u1,view,iphone", + "u1,view,ipad", + "u1,view,nexus", + "u1,view,galaxy", + "u2,view,iphone", + "u2,view,ipad", + "u2,view,nexus", + "u2,view,galaxy", + "u3,view,surface", + "u3,view,nexus", + "u4,view,iphone", + "u4,view,ipad", + "u4,view,galaxy") + + // this will create multiple part-xxxxx files in the InFile dir but other tests will + // take account of one actual file + val linesRdd = mahoutCtx.parallelize(lines).saveAsTextFile(InFile) + + // local multi-threaded Spark with default HDFS + ItemSimilarityDriver.main(Array( + "--input", InFile, + "--output", OutPath, + "--master", masterUrl, + "--filter1", "purchase", + "--filter2", "view", + "--inDelim", ",", + "--itemIDColumn", "2", + "--rowIDColumn", "0", + "--filterColumn", "1", + "--writeAllDatasets")) + + // todo: these comparisons rely on a sort producing the same lines, which could possibly + // fail since the sort is on value and these can be the same for all items in a vector + val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable + tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens + val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable + tokenize(crossSimilarityLines) should 
contain theSameElementsAs CrossSimilarityTokens + } + + + + test("ItemSimilarityDriver TSV ") { + + val InFile = TmpDir + "in-file.tsv/" + val OutPath = TmpDir + "similarity-matrices/" + + val lines = Array( + "u1\tpurchase\tiphone", + "u1\tpurchase\tipad", + "u2\tpurchase\tnexus", + "u2\tpurchase\tgalaxy", + "u3\tpurchase\tsurface", + "u4\tpurchase\tiphone", + "u4\tpurchase\tgalaxy", + "u1\tview\tiphone", + "u1\tview\tipad", + "u1\tview\tnexus", + "u1\tview\tgalaxy", + "u2\tview\tiphone", + "u2\tview\tipad", + "u2\tview\tnexus", + "u2\tview\tgalaxy", + "u3\tview\tsurface", + "u3\tview\tnexus", + "u4\tview\tiphone", + "u4\tview\tipad", + "u4\tview\tgalaxy") + + // this will create multiple part-xxxxx files in the InFile dir but other tests will + // take account of one actual file + val linesRdd = mahoutCtx.parallelize(lines).saveAsTextFile(InFile) + + // local multi-threaded Spark with default HDFS + ItemSimilarityDriver.main(Array( + "--input", InFile, + "--output", OutPath, + "--master", masterUrl, + "--filter1", "purchase", + "--filter2", "view", + "--inDelim", "[,\t]", + "--itemIDColumn", "2", + "--rowIDColumn", "0", + "--filterColumn", "1")) + + // todo: a better test would be to get sorted vectors and compare rows instead of tokens, this might miss + // some error cases + val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable + tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens + val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable + tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens + + } + + test("ItemSimilarityDriver log-ish files") { + + val InFile = TmpDir + "in-file.log/" + val OutPath = TmpDir + "similarity-matrices/" + + val lines = Array( + "2014-06-23 14:46:53.115\tu1\tpurchase\trandom text\tiphone", + "2014-06-23 14:46:53.115\tu1\tpurchase\trandom text\tipad", + "2014-06-23 
14:46:53.115\tu2\tpurchase\trandom text\tnexus", + "2014-06-23 14:46:53.115\tu2\tpurchase\trandom text\tgalaxy", + "2014-06-23 14:46:53.115\tu3\tpurchase\trandom text\tsurface", + "2014-06-23 14:46:53.115\tu4\tpurchase\trandom text\tiphone", + "2014-06-23 14:46:53.115\tu4\tpurchase\trandom text\tgalaxy", + "2014-06-23 14:46:53.115\tu1\tview\trandom text\tiphone", + "2014-06-23 14:46:53.115\tu1\tview\trandom text\tipad", + "2014-06-23 14:46:53.115\tu1\tview\trandom text\tnexus", + "2014-06-23 14:46:53.115\tu1\tview\trandom text\tgalaxy", + "2014-06-23 14:46:53.115\tu2\tview\trandom text\tiphone", + "2014-06-23 14:46:53.115\tu2\tview\trandom text\tipad", + "2014-06-23 14:46:53.115\tu2\tview\trandom text\tnexus", + "2014-06-23 14:46:53.115\tu2\tview\trandom text\tgalaxy", + "2014-06-23 14:46:53.115\tu3\tview\trandom text\tsurface", + "2014-06-23 14:46:53.115\tu3\tview\trandom text\tnexus", + "2014-06-23 14:46:53.115\tu4\tview\trandom text\tiphone", + "2014-06-23 14:46:53.115\tu4\tview\trandom text\tipad", + "2014-06-23 14:46:53.115\tu4\tview\trandom text\tgalaxy") + + // this will create multiple part-xxxxx files in the InFile dir but other tests will + // take account of one actual file + val linesRdd = mahoutCtx.parallelize(lines).saveAsTextFile(InFile) + + // local multi-threaded Spark with default HDFS + ItemSimilarityDriver.main(Array( + "--input", InFile, + "--output", OutPath, + "--master", masterUrl, + "--filter1", "purchase", + "--filter2", "view", + "--inDelim", "\t", + "--itemIDColumn", "4", + "--rowIDColumn", "1", + "--filterColumn", "2")) + + + val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable + tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens + val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable + tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens + + } + + test("ItemSimilarityDriver legacy 
supported file format") { + + val InDir = TmpDir + "in-dir/" + val InFilename = "in-file.tsv" + val InPath = InDir + InFilename + + val OutPath = TmpDir + "similarity-matrices" + + val lines = Array( + "0,0,1", + "0,1,1", + "1,2,1", + "1,3,1", + "2,4,1", + "3,0,1", + "3,3,1") + + val Answer = tokenize(Iterable( + "0\t1:1.7260924347106847", + "3\t2:1.7260924347106847", + "1\t0:1.7260924347106847", + "4", + "2\t3:1.7260924347106847")) + + // this creates one part-0000 file in the directory + mahoutCtx.parallelize(lines).coalesce(1, shuffle = true).saveAsTextFile(InDir) + + // to change from using part files to a single .tsv file we'll need to use HDFS + val fs = FileSystem.get(new Configuration()) + //rename part-00000 to something.tsv + fs.rename(new Path(InDir + "part-00000"), new Path(InPath)) + + // local multi-threaded Spark with default HDFS + ItemSimilarityDriver.main(Array( + "--input", InPath, + "--output", OutPath, + "--master", masterUrl)) + + val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable + tokenize(similarityLines) should contain theSameElementsAs Answer + + } + + test("ItemSimilarityDriver write search engine output") { + + val InDir = TmpDir + "in-dir/" + val InFilename = "in-file.tsv" + val InPath = InDir + InFilename + + val OutPath = TmpDir + "similarity-matrices" + + val lines = Array( + "0,0,1", + "0,1,1", + "1,2,1", + "1,3,1", + "2,4,1", + "3,0,1", + "3,3,1") + + val Answer = tokenize(Iterable( + "0\t1", + "3\t2", + "1\t0", + "4", + "2\t3")) + + // this creates one part-0000 file in the directory + mahoutCtx.parallelize(lines).coalesce(1, shuffle = true).saveAsTextFile(InDir) + + // to change from using part files to a single .tsv file we'll need to use HDFS + val fs = FileSystem.get(new Configuration()) + //rename part-00000 to something.tsv + fs.rename(new Path(InDir + "part-00000"), new Path(InPath)) + + // local multi-threaded Spark with default HDFS + ItemSimilarityDriver.main(Array( + "--input", 
InPath, + "--output", OutPath, + "--master", masterUrl, + "--omitStrength")) + + val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable + tokenize(similarityLines) should contain theSameElementsAs Answer + + } + + test("ItemSimilarityDriver recursive file discovery using filename patterns") { + //directory structure using the following + // tmp/data/m1.tsv + // tmp/data/more-data/another-dir/m2.tsv + val M1Lines = Array( + "u1\tpurchase\tiphone", + "u1\tpurchase\tipad", + "u2\tpurchase\tnexus", + "u2\tpurchase\tgalaxy", + "u3\tpurchase\tsurface", + "u4\tpurchase\tiphone", + "u4\tpurchase\tgalaxy") + + val M2Lines = Array( + "u1\tview\tiphone", + "u1\tview\tipad", + "u1\tview\tnexus", + "u1\tview\tgalaxy", + "u2\tview\tiphone", + "u2\tview\tipad", + "u2\tview\tnexus", + "u2\tview\tgalaxy", + "u3\tview\tsurface", + "u3\tview\tnexus", + "u4\tview\tiphone", + "u4\tview\tipad", + "u4\tview\tgalaxy") + + val InFilenameM1 = "m1.tsv" + val InDirM1 = TmpDir + "data/" + val InPathM1 = InDirM1 + InFilenameM1 + val InFilenameM2 = "m2.tsv" + val InDirM2 = TmpDir + "data/more-data/another-dir/" + val InPathM2 = InDirM2 + InFilenameM2 + + val InPathStart = TmpDir + "data/" + val OutPath = TmpDir + "similarity-matrices" + + // this creates one part-0000 file in the directory + mahoutCtx.parallelize(M1Lines).coalesce(1, shuffle = true).saveAsTextFile(InDirM1) + + // to change from using part files to a single .tsv file we'll need to use HDFS + val fs = FileSystem.get(new Configuration()) + //rename part-00000 to something.tsv + fs.rename(new Path(InDirM1 + "part-00000"), new Path(InPathM1)) + + // this creates one part-0000 file in the directory + mahoutCtx.parallelize(M2Lines).coalesce(1, shuffle = true).saveAsTextFile(InDirM2) + + // to change from using part files to a single .tsv file we'll need to use HDFS + //rename part-00000 to tmp/some-location/something.tsv + fs.rename(new Path(InDirM2 + "part-00000"), new Path(InPathM2)) + + // local 
multi-threaded Spark with default FS, suitable for build tests but need better location for data + + ItemSimilarityDriver.main(Array( + "--input", InPathStart, + "--output", OutPath, + "--master", masterUrl, + "--filter1", "purchase", + "--filter2", "view", + "--inDelim", "\t", + "--itemIDColumn", "2", + "--rowIDColumn", "0", + "--filterColumn", "1", + "--filenamePattern", "m..tsv", + "--recursive")) + + val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable + tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens + val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable + tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens + + } + + test("ItemSimilarityDriver, two input paths") { + + val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file + val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file + val OutPath = TmpDir + "similarity-matrices/" + + val lines = Array( + "u1,purchase,iphone", + "u1,purchase,ipad", + "u2,purchase,nexus", + "u2,purchase,galaxy", + "u3,purchase,surface", + "u4,purchase,iphone", + "u4,purchase,galaxy", + "u1,view,iphone", + "u1,view,ipad", + "u1,view,nexus", + "u1,view,galaxy", + "u2,view,iphone", + "u2,view,ipad", + "u2,view,nexus", + "u2,view,galaxy", + "u3,view,surface", + "u3,view,nexus", + "u4,view,iphone", + "u4,view,ipad", + "u4,view,galaxy") + + // this will create multiple part-xxxxx files in the InFile dir but other tests will + // take account of one actual file + val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) + val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) + + // local multi-threaded Spark with default HDFS + ItemSimilarityDriver.main(Array( + "--input", InFile1, + "--input2", InFile2, + "--output", OutPath, + "--master", masterUrl, + "--filter1", "purchase", + "--filter2", "view", + "--inDelim", ",", + 
"--itemIDColumn", "2", + "--rowIDColumn", "0", + "--filterColumn", "1")) + + val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable + tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens + val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable + tokenize(crossSimilarityLines) should contain theSameElementsAs CrossSimilarityTokens + + } + + test("ItemSimilarityDriver, two inputs of different dimensions") { + + val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file + val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file + val OutPath = TmpDir + "similarity-matrices/" + + val lines = Array( + "u1,purchase,iphone", + "u1,purchase,ipad", + "u2,purchase,nexus", + "u2,purchase,galaxy", + // remove one user so A'B will be of different dimensions + // ItemSimilarityDriver should create one unified user dictionary and so account for this + // discrepancy as a blank row: "u3,purchase,surface", + "u4,purchase,iphone", + "u4,purchase,galaxy", + "u1,view,iphone", + "u1,view,ipad", + "u1,view,nexus", + "u1,view,galaxy", + "u2,view,iphone", + "u2,view,ipad", + "u2,view,nexus", + "u2,view,galaxy", + "u3,view,surface", + "u3,view,nexus", + "u4,view,iphone", + "u4,view,ipad", + "u4,view,galaxy") + + val UnequalDimensionsSelfSimilarity = tokenize(Iterable( + "ipad\tiphone:1.7260924347106847", + "iphone\tipad:1.7260924347106847", + "nexus\tgalaxy:1.7260924347106847", + "galaxy\tnexus:1.7260924347106847")) + + //only surface purchase was removed so no cross-similarity for surface + val UnequalDimensionsCrossSimilarity = tokenize(Iterable( + "galaxy\tgalaxy:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 nexus:1.7260924347106847", + "iphone\tgalaxy:1.7260924347106847 iphone:1.7260924347106847 ipad:1.7260924347106847 nexus:1.7260924347106847", + "ipad\tgalaxy:0.6795961471815897 iphone:0.6795961471815897 
ipad:0.6795961471815897 nexus:0.6795961471815897", + "nexus\tiphone:0.6795961471815897 ipad:0.6795961471815897 nexus:0.6795961471815897 galaxy:0.6795961471815897")) + // this will create multiple part-xxxxx files in the InFile dir but other tests will + // take account of one actual file + val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) + val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) + + // local multi-threaded Spark with default HDFS + ItemSimilarityDriver.main(Array( + "--input", InFile1, + "--input2", InFile2, + "--output", OutPath, + "--master", masterUrl, + "--filter1", "purchase", + "--filter2", "view", + "--inDelim", ",", + "--itemIDColumn", "2", + "--rowIDColumn", "0", + "--filterColumn", "1")) + + val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable + val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable + tokenize(similarityLines) should contain theSameElementsAs UnequalDimensionsSelfSimilarity + tokenize(crossSimilarityLines) should contain theSameElementsAs UnequalDimensionsCrossSimilarity + + } + + test("ItemSimilarityDriver cross similarity two separate items spaces") { + /* cross-similarity with category views, same user space + phones tablets mobile_acc soap + u1 0 1 1 0 + u2 1 1 1 0 + u3 0 0 1 0 + u4 1 1 0 1 + */ + val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file + val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file + val OutPath = TmpDir + "similarity-matrices/" + + val lines = Array( + "u1,purchase,iphone", + "u1,purchase,ipad", + "u2,purchase,nexus", + "u2,purchase,galaxy", + "u3,purchase,surface", + "u4,purchase,iphone", + "u4,purchase,galaxy", + "u1,view,phones", + "u1,view,mobile_acc", + "u2,view,phones", + "u2,view,tablets", + "u2,view,mobile_acc", + "u3,view,mobile_acc", + "u4,view,phones", + "u4,view,tablets", + "u4,view,soap") + + val 
UnequalDimensionsCrossSimilarityLines = tokenize(Iterable( + "iphone\tmobile_acc:1.7260924347106847 soap:1.7260924347106847 phones:1.7260924347106847", + "surface\tmobile_acc:0.6795961471815897", + "nexus\ttablets:1.7260924347106847 mobile_acc:0.6795961471815897 phones:0.6795961471815897", + "galaxy\ttablets:5.545177444479561 soap:1.7260924347106847 phones:1.7260924347106847 " + + "mobile_acc:1.7260924347106847", + "ipad\tmobile_acc:0.6795961471815897 phones:0.6795961471815897")) + + // this will create multiple part-xxxxx files in the InFile dir but other tests will + // take account of one actual file + val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) + val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) + + // local multi-threaded Spark with default HDFS + ItemSimilarityDriver.main(Array( + "--input", InFile1, + "--input2", InFile2, + "--output", OutPath, + "--master", masterUrl, + "--filter1", "purchase", + "--filter2", "view", + "--inDelim", ",", + "--itemIDColumn", "2", + "--rowIDColumn", "0", + "--filterColumn", "1", + "--writeAllDatasets")) + + val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable + val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable + tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens + tokenize(crossSimilarityLines) should contain theSameElementsAs UnequalDimensionsCrossSimilarityLines + + } + + test("A.t %*% B after changing row cardinality of A") { + // todo: move to math tests but this is Spark specific + + val a = dense( + (1.0, 1.0)) + + val b = dense( + (1.0, 1.0), + (1.0, 1.0), + (1.0, 1.0)) + + val inCoreABiggertBAnswer = dense( + (1.0, 1.0), + (1.0, 1.0)) + + val drmA = drmParallelize(m = a, numPartitions = 2) + val drmB = drmParallelize(m = b, numPartitions = 2) + + // modified to return a new CheckpointedDrm so maintains immutability but still only increases the row 
cardinality + // by returning new CheckpointedDrmSpark[K](rdd, n, ncol, _cacheStorageLevel ) Hack for now. + val drmABigger = drmWrap[Int](drmA.rdd, 3, 2) + + + val ABiggertB = drmABigger.t %*% drmB + val inCoreABiggertB = ABiggertB.collect + + assert(inCoreABiggertB === inCoreABiggertBAnswer) + + val bp = 0 + } + + test("Changing row cardinality of an IndexedDataset") { + + val a = dense( + (1.0, 1.0)) + + val drmA = drmParallelize(m = a, numPartitions = 2) + val emptyIDs = new BiDictionary(new HashMap[String, Int]()) + val indexedDatasetA = new IndexedDatasetSpark(drmA, emptyIDs, emptyIDs) + val biggerIDSA = indexedDatasetA.newRowCardinality(5) + + assert(biggerIDSA.matrix.nrow == 5) + + } + + test("ItemSimilarityDriver cross similarity two separate items spaces, missing rows in B") { + /* cross-similarity with category views, same user space + phones tablets mobile_acc soap + u1 0 1 1 0 + u2 1 1 1 0 +removed ==> u3 0 0 1 0 + u4 1 1 0 1 + */ + val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file + val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file + val OutPath = TmpDir + "similarity-matrices/" + + val lines = Array( + "u1,purchase,iphone", + "u1,purchase,ipad", + "u2,purchase,nexus", + "u2,purchase,galaxy", + "u3,purchase,surface", + "u4,purchase,iphone", + "u4,purchase,galaxy", + "u1,view,phones", + "u1,view,mobile_acc", + "u2,view,phones", + "u2,view,tablets", + "u2,view,mobile_acc", + //"u3,view,mobile_acc",// if this line is removed the cross-cooccurrence should work + "u4,view,phones", + "u4,view,tablets", + "u4,view,soap") + + val UnequalDimensionsCrossSimilarityLines = tokenize(Iterable( + "galaxy\ttablets:5.545177444479561 soap:1.7260924347106847 phones:1.7260924347106847", + "ipad\tmobile_acc:1.7260924347106847 phones:0.6795961471815897", + "surface", + "nexus\tmobile_acc:1.7260924347106847 tablets:1.7260924347106847 phones:0.6795961471815897", + "iphone\tsoap:1.7260924347106847 phones:1.7260924347106847")) 
+ + // this will create multiple part-xxxxx files in the InFile dir but other tests will + // take account of one actual file + val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) + val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) + + // local multi-threaded Spark with default HDFS + ItemSimilarityDriver.main(Array( + "--input", InFile1, + "--input2", InFile2, + "--output", OutPath, + "--master", masterUrl, + "--filter1", "purchase", + "--filter2", "view", + "--inDelim", ",", + "--itemIDColumn", "2", + "--rowIDColumn", "0", + "--filterColumn", "1", + "--writeAllDatasets")) + + val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable + val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable + tokenize(similarityLines) should contain theSameElementsAs SelfSimilairtyTokens + tokenize(crossSimilarityLines) should contain theSameElementsAs UnequalDimensionsCrossSimilarityLines + } + + test("ItemSimilarityDriver cross similarity two separate items spaces, adding rows in B") { + /* cross-similarity with category views, same user space + phones tablets mobile_acc soap + u1 0 1 1 0 + u2 1 1 1 0 +removed ==> u3 0 0 1 0 + u4 1 1 0 1 + */ + val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file + val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file + val OutPath = TmpDir + "similarity-matrices/" + + val lines = Array( + "u1,purchase,iphone", + "u1,purchase,ipad", + "u2,purchase,nexus", + "u2,purchase,galaxy", + "u3,purchase,surface", + "u4,purchase,iphone", + "u4,purchase,galaxy", + "u1,view,phones", + "u1,view,mobile_acc", + "u2,view,phones", + "u2,view,tablets", + "u2,view,mobile_acc", + "u3,view,mobile_acc",// if this line is removed the cross-cooccurrence should work + "u4,view,phones", + "u4,view,tablets", + "u4,view,soap", + "u5,view,soap") + + val UnequalDimensionsSimilarityTokens = List( + "galaxy", + 
"nexus:2.231435513142097", + "iphone:0.13844293808390518", + "nexus", + "galaxy:2.231435513142097", + "ipad", + "iphone:2.231435513142097", + "surface", + "iphone", + "ipad:2.231435513142097", + "galaxy:0.13844293808390518") + + val UnequalDimensionsCrossSimilarityLines = List( + "galaxy", + "tablets:6.730116670092563", + "phones:2.9110316603236868", + "soap:0.13844293808390518", + "mobile_acc:0.13844293808390518", + "nexus", + "tablets:2.231435513142097", + "mobile_acc:1.184939225613002", + "phones:1.184939225613002", + "ipad", "mobile_acc:1.184939225613002", + "phones:1.184939225613002", + "surface", + "mobile_acc:1.184939225613002", + "iphone", + "phones:2.9110316603236868", + "soap:0.13844293808390518", + "tablets:0.13844293808390518", + "mobile_acc:0.13844293808390518") + + // this will create multiple part-xxxxx files in the InFile dir but other tests will + // take account of one actual file + val linesRdd1 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile1) + val linesRdd2 = mahoutCtx.parallelize(lines).saveAsTextFile(InFile2) + + // local multi-threaded Spark with default HDFS + ItemSimilarityDriver.main(Array( + "--input", InFile1, + "--input2", InFile2, + "--output", OutPath, + "--master", masterUrl, + "--filter1", "purchase", + "--filter2", "view", + "--inDelim", ",", + "--itemIDColumn", "2", + "--rowIDColumn", "0", + "--filterColumn", "1", + "--writeAllDatasets")) + + val similarityLines = mahoutCtx.textFile(OutPath + "/similarity-matrix/").collect.toIterable + val crossSimilarityLines = mahoutCtx.textFile(OutPath + "/cross-similarity-matrix/").collect.toIterable + tokenize(similarityLines) should contain theSameElementsAs UnequalDimensionsSimilarityTokens + tokenize(crossSimilarityLines) should contain theSameElementsAs UnequalDimensionsCrossSimilarityLines + } + + // convert into an Iterable of tokens for 'should contain theSameElementsAs Iterable' + def tokenize(a: Iterable[String]): Iterable[String] = { + var r: Iterable[String] = Iterable() + 
a.foreach { l => + l.split("\t").foreach { s => + r = r ++ s.split("[\t ]") + } + } + r + } + + override protected def beforeAll(configMap: ConfigMap) { + super.beforeAll(configMap) + ItemSimilarityDriver.useContext(mahoutCtx) + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/67f3a65c/spark/src/test/scala/org/apache/mahout/drivers/RowSimilarityDriverSuite.scala ---------------------------------------------------------------------- diff --git a/spark/src/test/scala/org/apache/mahout/drivers/RowSimilarityDriverSuite.scala b/spark/src/test/scala/org/apache/mahout/drivers/RowSimilarityDriverSuite.scala index e6f917c..eccddb1 100644 --- a/spark/src/test/scala/org/apache/mahout/drivers/RowSimilarityDriverSuite.scala +++ b/spark/src/test/scala/org/apache/mahout/drivers/RowSimilarityDriverSuite.scala @@ -1,139 +1,139 @@ -///* -// * Licensed to the Apache Software Foundation (ASF) under one or more -// * contributor license agreements. See the NOTICE file distributed with -// * this work for additional information regarding copyright ownership. -// * The ASF licenses this file to You under the Apache License, Version 2.0 -// * (the "License"); you may not use this file except in compliance with -// * the License. You may obtain a copy of the License at -// * -// * http://www.apache.org/licenses/LICENSE-2.0 -// * -// * Unless required by applicable law or agreed to in writing, software -// * distributed under the License is distributed on an "AS IS" BASIS, -// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// * See the License for the specific language governing permissions and -// * limitations under the License. 
-// */ -// -//package org.apache.mahout.drivers -// -//import org.apache.hadoop.conf.Configuration -//import org.apache.hadoop.fs.{FileSystem, Path} -//import org.apache.mahout.math.drm.RLikeDrmOps._ -//import org.apache.mahout.math.drm._ -//import org.apache.mahout.math.scalabindings.RLikeOps._ -//import org.apache.mahout.math.scalabindings._ -//import org.apache.mahout.sparkbindings._ -//import org.apache.mahout.sparkbindings.test.DistributedSparkSuite -//import org.scalatest.{ConfigMap, FunSuite} -// -// -//class RowSimilarityDriverSuite extends FunSuite with DistributedSparkSuite { -// -// val TextDocs = Array( -// "doc1\tNow is the time for all good people to come to aid of their party", -// "doc2\tNow is the time for all good people to come to aid of their country", -// "doc3\tNow is the time for all good people to come to aid of their hood", -// "doc4\tNow is the time for all good people to come to aid of their friends", -// "doc5\tNow is the time for all good people to come to aid of their looser brother", -// "doc6\tThe quick brown fox jumped over the lazy dog", -// "doc7\tThe quick brown fox jumped over the lazy boy", -// "doc8\tThe quick brown fox jumped over the lazy cat", -// "doc9\tThe quick brown fox jumped over the lazy wolverine", -// "doc10\tThe quick brown fox jumped over the lazy cantelope")// yes that's spelled correctly. 
-// -// test("RowSimilarityDriver text docs no strengths") { -// -// val firstFiveSimDocsTokens = tokenize(Iterable( -// "doc1\tdoc3 doc2 doc4 doc5")) -// -// val lastFiveSimDocsTokens = tokenize(Iterable( -// "doc6\tdoc8 doc10 doc7 doc9")) -// -// val inDir = TmpDir + "in-dir/" -// val inFilename = "in-file.tsv" -// val inPath = inDir + inFilename -// -// val outPath = TmpDir + "similarity-matrices/" -// -// -// // this creates one part-0000 file in the directory -// mahoutCtx.parallelize(TextDocs).coalesce(1, shuffle=true).saveAsTextFile(inDir) -// -// // to change from using part files to a single .tsv file we'll need to use HDFS -// val fs = FileSystem.get(new Configuration()) -// //rename part-00000 to something.tsv -// fs.rename(new Path(inDir + "part-00000"), new Path(inPath)) -// -// // local multi-threaded Spark with default HDFS -// RowSimilarityDriver.main(Array( -// "--input", inPath, -// "--output", outPath, -// "--omitStrength", -// "--maxSimilaritiesPerRow", "4", // would get all docs similar if we didn't limit them -// "--master", masterUrl)) -// -// val simLines = mahoutCtx.textFile(outPath).collect -// simLines.foreach { line => -// val lineTokens = line.split("[\t ]") -// if (lineTokens.contains("doc1") ) // docs are two flavors so if only 4 similarities it will effectively classify -// lineTokens should contain theSameElementsAs firstFiveSimDocsTokens -// else -// lineTokens should contain theSameElementsAs lastFiveSimDocsTokens -// } -// -// } -// -// test("RowSimilarityDriver text docs") { -// -// val simDocsTokens = tokenize(Iterable( -// "doc1\tdoc3:27.87301122947484 doc2:27.87301122947484 doc4:27.87301122947484 doc5:23.42278065550721", -// "doc2\tdoc4:27.87301122947484 doc3:27.87301122947484 doc1:27.87301122947484 doc5:23.42278065550721", -// "doc3\tdoc4:27.87301122947484 doc2:27.87301122947484 doc1:27.87301122947484 doc5:23.42278065550721", -// "doc4\tdoc3:27.87301122947484 doc2:27.87301122947484 doc1:27.87301122947484 
doc5:23.42278065550721", -// "doc5\tdoc4:23.42278065550721 doc2:23.42278065550721 doc3:23.42278065550721 doc1:23.42278065550721", -// "doc6\tdoc8:22.936393049704463 doc10:22.936393049704463 doc7:22.936393049704463 doc9:22.936393049704463", -// "doc7\tdoc6:22.936393049704463 doc8:22.936393049704463 doc10:22.936393049704463 doc9:22.936393049704463", -// "doc8\tdoc6:22.936393049704463 doc10:22.936393049704463 doc7:22.936393049704463 doc9:22.936393049704463", -// "doc9\tdoc6:22.936393049704463 doc8:22.936393049704463 doc10:22.936393049704463 doc7:22.936393049704463", -// "doc10\tdoc6:22.936393049704463 doc8:22.936393049704463 doc7:22.936393049704463 doc9:22.936393049704463")) -// -// val inDir = TmpDir + "in-dir/" -// val inFilename = "in-file.tsv" -// val inPath = inDir + inFilename -// -// val outPath = TmpDir + "similarity-matrix/" -// -// -// // this creates one part-0000 file in the directory -// mahoutCtx.parallelize(TextDocs).coalesce(1, shuffle=true).saveAsTextFile(inDir) -// -// // to change from using part files to a single .tsv file we'll need to use HDFS -// val fs = FileSystem.get(new Configuration()) -// //rename part-00000 to something.tsv -// fs.rename(new Path(inDir + "part-00000"), new Path(inPath)) -// -// // local multi-threaded Spark with default HDFS -// RowSimilarityDriver.main(Array( -// "--input", inPath, -// "--output", outPath, -// "--maxSimilaritiesPerRow", "4", // would get all docs similar if we didn't limit them -// "--master", masterUrl)) -// -// val simLines = mahoutCtx.textFile(outPath).collect -// tokenize(simLines) should contain theSameElementsAs simDocsTokens -// } -// -// // convert into an Iterable of tokens for 'should contain theSameElementsAs Iterable' -// def tokenize(a: Iterable[String], splitString: String = "[\t ]"): Iterable[String] = { -// var r: Iterable[String] = Iterable() -// a.foreach ( l => r = r ++ l.split(splitString) ) -// r -// } -// -// override protected def beforeAll(configMap: ConfigMap) { -// 
super.beforeAll(configMap) -// RowSimilarityDriver.useContext(mahoutCtx) -// } -// -//} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.drivers + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.mahout.math.drm.RLikeDrmOps._ +import org.apache.mahout.math.drm._ +import org.apache.mahout.math.scalabindings.RLikeOps._ +import org.apache.mahout.math.scalabindings._ +import org.apache.mahout.sparkbindings._ +import org.apache.mahout.sparkbindings.test.DistributedSparkSuite +import org.scalatest.{ConfigMap, FunSuite} + + +class RowSimilarityDriverSuite extends FunSuite with DistributedSparkSuite { + + val TextDocs = Array( + "doc1\tNow is the time for all good people to come to aid of their party", + "doc2\tNow is the time for all good people to come to aid of their country", + "doc3\tNow is the time for all good people to come to aid of their hood", + "doc4\tNow is the time for all good people to come to aid of their friends", + "doc5\tNow is the time for all good people to come to aid of their looser brother", + "doc6\tThe quick brown fox jumped over the lazy dog", + "doc7\tThe quick brown fox jumped over the lazy boy", + 
"doc8\tThe quick brown fox jumped over the lazy cat", + "doc9\tThe quick brown fox jumped over the lazy wolverine", + "doc10\tThe quick brown fox jumped over the lazy cantelope")// yes that's spelled correctly. + + test("RowSimilarityDriver text docs no strengths") { + + val firstFiveSimDocsTokens = tokenize(Iterable( + "doc1\tdoc3 doc2 doc4 doc5")) + + val lastFiveSimDocsTokens = tokenize(Iterable( + "doc6\tdoc8 doc10 doc7 doc9")) + + val inDir = TmpDir + "in-dir/" + val inFilename = "in-file.tsv" + val inPath = inDir + inFilename + + val outPath = TmpDir + "similarity-matrices/" + + + // this creates one part-0000 file in the directory + mahoutCtx.parallelize(TextDocs).coalesce(1, shuffle=true).saveAsTextFile(inDir) + + // to change from using part files to a single .tsv file we'll need to use HDFS + val fs = FileSystem.get(new Configuration()) + //rename part-00000 to something.tsv + fs.rename(new Path(inDir + "part-00000"), new Path(inPath)) + + // local multi-threaded Spark with default HDFS + RowSimilarityDriver.main(Array( + "--input", inPath, + "--output", outPath, + "--omitStrength", + "--maxSimilaritiesPerRow", "4", // would get all docs similar if we didn't limit them + "--master", masterUrl)) + + val simLines = mahoutCtx.textFile(outPath).collect + simLines.foreach { line => + val lineTokens = line.split("[\t ]") + if (lineTokens.contains("doc1") ) // docs are two flavors so if only 4 similarities it will effectively classify + lineTokens should contain theSameElementsAs firstFiveSimDocsTokens + else + lineTokens should contain theSameElementsAs lastFiveSimDocsTokens + } + + } + + test("RowSimilarityDriver text docs") { + + val simDocsTokens = tokenize(Iterable( + "doc1\tdoc3:27.87301122947484 doc2:27.87301122947484 doc4:27.87301122947484 doc5:23.42278065550721", + "doc2\tdoc4:27.87301122947484 doc3:27.87301122947484 doc1:27.87301122947484 doc5:23.42278065550721", + "doc3\tdoc4:27.87301122947484 doc2:27.87301122947484 doc1:27.87301122947484 
doc5:23.42278065550721", + "doc4\tdoc3:27.87301122947484 doc2:27.87301122947484 doc1:27.87301122947484 doc5:23.42278065550721", + "doc5\tdoc4:23.42278065550721 doc2:23.42278065550721 doc3:23.42278065550721 doc1:23.42278065550721", + "doc6\tdoc8:22.936393049704463 doc10:22.936393049704463 doc7:22.936393049704463 doc9:22.936393049704463", + "doc7\tdoc6:22.936393049704463 doc8:22.936393049704463 doc10:22.936393049704463 doc9:22.936393049704463", + "doc8\tdoc6:22.936393049704463 doc10:22.936393049704463 doc7:22.936393049704463 doc9:22.936393049704463", + "doc9\tdoc6:22.936393049704463 doc8:22.936393049704463 doc10:22.936393049704463 doc7:22.936393049704463", + "doc10\tdoc6:22.936393049704463 doc8:22.936393049704463 doc7:22.936393049704463 doc9:22.936393049704463")) + + val inDir = TmpDir + "in-dir/" + val inFilename = "in-file.tsv" + val inPath = inDir + inFilename + + val outPath = TmpDir + "similarity-matrix/" + + + // this creates one part-0000 file in the directory + mahoutCtx.parallelize(TextDocs).coalesce(1, shuffle=true).saveAsTextFile(inDir) + + // to change from using part files to a single .tsv file we'll need to use HDFS + val fs = FileSystem.get(new Configuration()) + //rename part-00000 to something.tsv + fs.rename(new Path(inDir + "part-00000"), new Path(inPath)) + + // local multi-threaded Spark with default HDFS + RowSimilarityDriver.main(Array( + "--input", inPath, + "--output", outPath, + "--maxSimilaritiesPerRow", "4", // would get all docs similar if we didn't limit them + "--master", masterUrl)) + + val simLines = mahoutCtx.textFile(outPath).collect + tokenize(simLines) should contain theSameElementsAs simDocsTokens + } + + // convert into an Iterable of tokens for 'should contain theSameElementsAs Iterable' + def tokenize(a: Iterable[String], splitString: String = "[\t ]"): Iterable[String] = { + var r: Iterable[String] = Iterable() + a.foreach ( l => r = r ++ l.split(splitString) ) + r + } + + override protected def beforeAll(configMap: 
ConfigMap) { + super.beforeAll(configMap) + RowSimilarityDriver.useContext(mahoutCtx) + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/67f3a65c/spark/src/test/scala/org/apache/mahout/drivers/TextDelimitedReaderWriterSuite.scala ---------------------------------------------------------------------- diff --git a/spark/src/test/scala/org/apache/mahout/drivers/TextDelimitedReaderWriterSuite.scala b/spark/src/test/scala/org/apache/mahout/drivers/TextDelimitedReaderWriterSuite.scala index 8e56f1e..5d92cca 100644 --- a/spark/src/test/scala/org/apache/mahout/drivers/TextDelimitedReaderWriterSuite.scala +++ b/spark/src/test/scala/org/apache/mahout/drivers/TextDelimitedReaderWriterSuite.scala @@ -1,53 +1,53 @@ -///* -// * Licensed to the Apache Software Foundation (ASF) under one or more -// * contributor license agreements. See the NOTICE file distributed with -// * this work for additional information regarding copyright ownership. -// * The ASF licenses this file to You under the Apache License, Version 2.0 -// * (the "License"); you may not use this file except in compliance with -// * the License. You may obtain a copy of the License at -// * -// * http://www.apache.org/licenses/LICENSE-2.0 -// * -// * Unless required by applicable law or agreed to in writing, software -// * distributed under the License is distributed on an "AS IS" BASIS, -// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// * See the License for the specific language governing permissions and -// * limitations under the License. 
-// */ -// -//package org.apache.mahout.drivers -// -//import org.apache.mahout.math.indexeddataset.DefaultIndexedDatasetReadSchema -//import org.apache.mahout.sparkbindings._ -//import org.apache.mahout.sparkbindings.test.DistributedSparkSuite -//import org.scalatest.FunSuite -// -//import scala.collection.JavaConversions._ -// -//class TextDelimitedReaderWriterSuite extends FunSuite with DistributedSparkSuite { -// test("indexedDatasetDFSRead should read sparse matrix file with null rows") { -// val OutFile = TmpDir + "similarity-matrices/part-00000" -// -// val lines = Array( -// "galaxy\tnexus:1.0", -// "ipad\tiphone:2.0", -// "nexus\tgalaxy:3.0", -// "iphone\tipad:4.0", -// "surface" -// ) -// val linesRdd = mahoutCtx.parallelize(lines).saveAsTextFile(OutFile) -// -// val data = mahoutCtx.indexedDatasetDFSRead(OutFile, DefaultIndexedDatasetReadSchema) -// -// data.rowIDs.toMap.keySet should equal(Set("galaxy", "ipad", "nexus", "iphone", "surface")) -// data.columnIDs.toMap.keySet should equal(Set("nexus", "iphone", "galaxy", "ipad")) -// -// val a = data.matrix.collect -// a.setRowLabelBindings(mapAsJavaMap(data.rowIDs.toMap).asInstanceOf[java.util.Map[java.lang.String, java.lang.Integer]]) -// a.setColumnLabelBindings(mapAsJavaMap(data.columnIDs.toMap).asInstanceOf[java.util.Map[java.lang.String, java.lang.Integer]]) -// a.get("galaxy", "nexus") should equal(1.0) -// a.get("ipad", "iphone") should equal(2.0) -// a.get("nexus", "galaxy") should equal(3.0) -// a.get("iphone", "ipad") should equal(4.0) -// } -//} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.drivers + +import org.apache.mahout.math.indexeddataset.DefaultIndexedDatasetReadSchema +import org.apache.mahout.sparkbindings._ +import org.apache.mahout.sparkbindings.test.DistributedSparkSuite +import org.scalatest.FunSuite + +import scala.collection.JavaConversions._ + +class TextDelimitedReaderWriterSuite extends FunSuite with DistributedSparkSuite { + test("indexedDatasetDFSRead should read sparse matrix file with null rows") { + val OutFile = TmpDir + "similarity-matrices/part-00000" + + val lines = Array( + "galaxy\tnexus:1.0", + "ipad\tiphone:2.0", + "nexus\tgalaxy:3.0", + "iphone\tipad:4.0", + "surface" + ) + val linesRdd = mahoutCtx.parallelize(lines).saveAsTextFile(OutFile) + + val data = mahoutCtx.indexedDatasetDFSRead(OutFile, DefaultIndexedDatasetReadSchema) + + data.rowIDs.toMap.keySet should equal(Set("galaxy", "ipad", "nexus", "iphone", "surface")) + data.columnIDs.toMap.keySet should equal(Set("nexus", "iphone", "galaxy", "ipad")) + + val a = data.matrix.collect + a.setRowLabelBindings(mapAsJavaMap(data.rowIDs.toMap).asInstanceOf[java.util.Map[java.lang.String, java.lang.Integer]]) + a.setColumnLabelBindings(mapAsJavaMap(data.columnIDs.toMap).asInstanceOf[java.util.Map[java.lang.String, java.lang.Integer]]) + a.get("galaxy", "nexus") should equal(1.0) + a.get("ipad", "iphone") should equal(2.0) + a.get("nexus", "galaxy") should equal(3.0) + a.get("iphone", "ipad") should equal(4.0) + } +}
