Repository: mahout Updated Branches: refs/heads/master 8b2bec7f5 -> 24cb5576f
MAHOUT-1541 backed out compatibility with legacy Item Similarity, now outputs raw LLR scores Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/24cb5576 Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/24cb5576 Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/24cb5576 Branch: refs/heads/master Commit: 24cb5576f720737b73906ebb15be486d540ac629 Parents: 8b2bec7 Author: pferrel <[email protected]> Authored: Sat Jul 5 13:36:24 2014 -0700 Committer: pferrel <[email protected]> Committed: Sat Jul 5 13:36:24 2014 -0700 ---------------------------------------------------------------------- .../apache/mahout/cf/CooccurrenceAnalysis.scala | 8 ++-- .../mahout/cf/CooccurrenceAnalysisSuite.scala | 50 ++++++++++---------- .../drivers/ItemSimilarityDriverSuite.scala | 41 ++++++++-------- 3 files changed, 51 insertions(+), 48 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/24cb5576/spark/src/main/scala/org/apache/mahout/cf/CooccurrenceAnalysis.scala ---------------------------------------------------------------------- diff --git a/spark/src/main/scala/org/apache/mahout/cf/CooccurrenceAnalysis.scala b/spark/src/main/scala/org/apache/mahout/cf/CooccurrenceAnalysis.scala index b01332c..14cc9d5 100644 --- a/spark/src/main/scala/org/apache/mahout/cf/CooccurrenceAnalysis.scala +++ b/spark/src/main/scala/org/apache/mahout/cf/CooccurrenceAnalysis.scala @@ -134,9 +134,11 @@ object CooccurrenceAnalysis extends Serializable { val llr = logLikelihoodRatio(numInteractionsB(thingB).toLong, numInteractionsA(thingA).toLong, cooccurrences.toLong, numUsers) - // matches hadoop code and maps values to range (0..1) - val tLLR = 1.0 - (1.0 / (1.0 + llr)) - val candidate = thingA -> tLLR + val candidate = thingA -> llr + + // matches legacy hadoop code and maps values to range (0..1) + // val tLLR = 1.0 - (1.0 / (1.0 + llr)) + //val 
candidate = thingA -> tLLR // Enqueue item with score, if belonging to the top-k if (topItemsPerThing.size < maxInterestingItemsPerThing) { http://git-wip-us.apache.org/repos/asf/mahout/blob/24cb5576/spark/src/test/scala/org/apache/mahout/cf/CooccurrenceAnalysisSuite.scala ---------------------------------------------------------------------- diff --git a/spark/src/test/scala/org/apache/mahout/cf/CooccurrenceAnalysisSuite.scala b/spark/src/test/scala/org/apache/mahout/cf/CooccurrenceAnalysisSuite.scala index e46dad5..065f2f8 100644 --- a/spark/src/test/scala/org/apache/mahout/cf/CooccurrenceAnalysisSuite.scala +++ b/spark/src/test/scala/org/apache/mahout/cf/CooccurrenceAnalysisSuite.scala @@ -41,19 +41,19 @@ class CooccurrenceAnalysisSuite extends FunSuite with MahoutSuite with MahoutLoc // correct cooccurrence with LLR final val matrixLLRCoocAtAControl = dense( - (0.0, 0.6331745808516107, 0.0, 0.0, 0.0), - (0.6331745808516107, 0.0, 0.0, 0.0, 0.0), - (0.0, 0.0, 0.0, 0.6331745808516107, 0.0), - (0.0, 0.0, 0.6331745808516107, 0.0, 0.0), + (0.0, 1.7260924347106847, 0.0, 0.0, 0.0), + (1.7260924347106847, 0.0, 0.0, 0.0, 0.0), + (0.0, 0.0, 0.0, 1.7260924347106847, 0.0), + (0.0, 0.0, 1.7260924347106847, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0)) // correct cross-cooccurrence with LLR final val matrixLLRCoocBtAControl = dense( - (0.6331745808516107, 0.4046187819149094, 0.4046187819149094, 0.6331745808516107, 0.0), - (0.6331745808516107, 0.4046187819149094, 0.4046187819149094, 0.6331745808516107, 0.0), - (0.6331745808516107, 0.4046187819149094, 0.4046187819149094, 0.6331745808516107, 0.40461878191490940), - (0.6331745808516107, 0.4046187819149094, 0.4046187819149094, 0.6331745808516107, 0.0), - (0.0, 0.0, 0.0, 0.0, 0.8181382096075936)) + (1.7260924347106847, 0.6795961471815897, 0.6795961471815897, 1.7260924347106847, 0.0), + (1.7260924347106847, 0.6795961471815897, 0.6795961471815897, 1.7260924347106847, 0.0), + (1.7260924347106847, 0.6795961471815897, 0.6795961471815897, 
1.7260924347106847, 0.6795961471815897), + (1.7260924347106847, 0.6795961471815897, 0.6795961471815897, 1.7260924347106847, 0.0), + (0.0, 0.0, 0.0, 0.0, 4.498681156950466)) @@ -90,16 +90,16 @@ class CooccurrenceAnalysisSuite extends FunSuite with MahoutSuite with MahoutLoc test("cooccurrence [A'A], [B'A] double data using LLR") { val a = dense( - (100000.0D, 1.0D, 0.0D, 0.0D, 0.0D), - (0.0D, 0.0D, 10.0D, 1.0D, 0.0D), - (0.0D, 0.0D, 0.0D, 0.0D, 1000.0D), - (1.0D, 0.0D, 0.0D, 10.0D, 0.0D)) + (100000.0D, 1.0D, 0.0D, 0.0D, 0.0D), + ( 0.0D, 0.0D, 10.0D, 1.0D, 0.0D), + ( 0.0D, 0.0D, 0.0D, 0.0D, 1000.0D), + ( 1.0D, 0.0D, 0.0D, 10.0D, 0.0D)) val b = dense( - (10000.0D, 100.0D, 1000.0D, 1.0D, 0.0D), - (10.0D, 1.0D, 10000000.0D, 10.0D, 0.0D), - (0.0D, 0.0D, 1000.0D, 0.0D, 100.0D), - (100.0D, 1.0D, 0.0D, 100000.0D, 0.0D)) + (10000.0D, 100.0D, 1000.0D, 1.0D, 0.0D), + ( 10.0D, 1.0D, 10000000.0D, 10.0D, 0.0D), + ( 0.0D, 0.0D, 1000.0D, 0.0D, 100.0D), + ( 100.0D, 1.0D, 0.0D, 100000.0D, 0.0D)) val drmA = drmParallelize(m = a, numPartitions = 2) val drmB = drmParallelize(m = b, numPartitions = 2) @@ -120,16 +120,16 @@ class CooccurrenceAnalysisSuite extends FunSuite with MahoutSuite with MahoutLoc test("cooccurrence [A'A], [B'A] integer data using LLR") { val a = dense( - (1000, 10, 0, 0, 0), - (0, 0, -10000, 10, 0), - (0, 0, 0, 0, 100), - (10000, 0, 0, 1000, 0)) + ( 1000, 10, 0, 0, 0), + ( 0, 0, -10000, 10, 0), + ( 0, 0, 0, 0, 100), + (10000, 0, 0, 1000, 0)) val b = dense( - (100, 1000, -10000, 10000, 0), - (10000, 1000, 100, 10, 0), - (0, 0, 10, 0, -100), - (10, 100, 0, 1000, 0)) + ( 100, 1000, -10000, 10000, 0), + (10000, 1000, 100, 10, 0), + ( 0, 0, 10, 0, -100), + ( 10, 100, 0, 1000, 0)) val drmA = drmParallelize(m = a, numPartitions = 2) val drmB = drmParallelize(m = b, numPartitions = 2) http://git-wip-us.apache.org/repos/asf/mahout/blob/24cb5576/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala 
---------------------------------------------------------------------- diff --git a/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala b/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala index e4a75de..2827317 100644 --- a/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala +++ b/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala @@ -45,17 +45,18 @@ class ItemSimilarityDriverSuite extends FunSuite with MahoutSuite with MahoutLoc */ final val SelfSimilairtyTSV = Set( - "galaxy\tnexus:0.6331745808516107", - "ipad\tiphone:0.6331745808516107", - "nexus\tgalaxy:0.6331745808516107", - "iphone\tipad:0.6331745808516107", - "surface") - final val CrossSimilarityTSV = Set( - "galaxy\tnexus:0.4046187819149094,iphone:0.6331745808516107,ipad:0.4046187819149094,galaxy:0.6331745808516107", - "surface\tsurface:0.8181382096075936", - "nexus\tnexus:0.4046187819149094,iphone:0.6331745808516107,ipad:0.4046187819149094,surface:0.4046187819149094,galaxy:0.6331745808516107", - "ipad\tnexus:0.4046187819149094,iphone:0.6331745808516107,ipad:0.4046187819149094,galaxy:0.6331745808516107", - "iphone\tnexus:0.4046187819149094,iphone:0.6331745808516107,ipad:0.4046187819149094,galaxy:0.6331745808516107") + "galaxy\tnexus:1.7260924347106847", + "ipad\tiphone:1.7260924347106847", + "nexus\tgalaxy:1.7260924347106847", + "iphone\tipad:1.7260924347106847", + "surface") + + final val CrossSimilarityTSV = Set("" + + "nexus\tnexus:0.6795961471815897,iphone:1.7260924347106847,ipad:0.6795961471815897,surface:0.6795961471815897,galaxy:1.7260924347106847", + "ipad\tnexus:0.6795961471815897,iphone:1.7260924347106847,ipad:0.6795961471815897,galaxy:1.7260924347106847", + "surface\tsurface:4.498681156950466", + "iphone\tnexus:0.6795961471815897,iphone:1.7260924347106847,ipad:0.6795961471815897,galaxy:1.7260924347106847", + 
"galaxy\tnexus:0.6795961471815897,iphone:1.7260924347106847,ipad:0.6795961471815897,galaxy:1.7260924347106847") final val TmpDir = "tmp/" // all IO going to whatever the default HDFS config is pointing to @@ -88,7 +89,7 @@ class ItemSimilarityDriverSuite extends FunSuite with MahoutSuite with MahoutLoc )) */ - ignore ("ItemSimilarityDriver, non-full-spec CSV"){ + test ("ItemSimilarityDriver, non-full-spec CSV"){ val InFile = TmpDir + "in-file.csv/" //using part files, not singel file val OutPath = TmpDir + "indicator-matrices/" @@ -143,7 +144,7 @@ class ItemSimilarityDriverSuite extends FunSuite with MahoutSuite with MahoutLoc - ignore ("ItemSimilarityDriver TSV "){ + test ("ItemSimilarityDriver TSV "){ val InFile = TmpDir + "in-file.tsv/" val OutPath = TmpDir + "indicator-matrices/" @@ -197,7 +198,7 @@ class ItemSimilarityDriverSuite extends FunSuite with MahoutSuite with MahoutLoc } - ignore ("ItemSimilarityDriver log-ish files"){ + test ("ItemSimilarityDriver log-ish files"){ val InFile = TmpDir + "in-file.log/" val OutPath = TmpDir + "indicator-matrices/" @@ -251,7 +252,7 @@ class ItemSimilarityDriverSuite extends FunSuite with MahoutSuite with MahoutLoc } - ignore ("ItemSimilarityDriver legacy supported file format"){ + test ("ItemSimilarityDriver legacy supported file format"){ val InDir = TmpDir + "in-dir/" val InFilename = "in-file.tsv" @@ -269,11 +270,11 @@ class ItemSimilarityDriverSuite extends FunSuite with MahoutSuite with MahoutLoc "3,3,1") val Answer = Set( - "0\t1:0.6331745808516107", - "3\t2:0.6331745808516107", - "1\t0:0.6331745808516107", + "0\t1:1.7260924347106847", + "3\t2:1.7260924347106847", + "1\t0:1.7260924347106847", "4", - "2\t3:0.6331745808516107") + "2\t3:1.7260924347106847") // this creates one part-0000 file in the directory mahoutCtx.parallelize(lines).coalesce(1, shuffle=true).saveAsTextFile(InDir) @@ -298,7 +299,7 @@ class ItemSimilarityDriverSuite extends FunSuite with MahoutSuite with MahoutLoc } - ignore("ItemSimilarityDriver 
recursive file discovery using filename patterns"){ + test("ItemSimilarityDriver recursive file discovery using filename patterns"){ //directory structure using the following // tmp/data/m1.tsv // tmp/data/more-data/another-dir/m2.tsv
