Repository: mahout
Updated Branches:
  refs/heads/master 1d198100a -> defbbd20f
NOJIRA Fix LastFM CCO Row Cardinality Bug closes apache/mahout#351

Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/defbbd20
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/defbbd20
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/defbbd20

Branch: refs/heads/master
Commit: defbbd20f78c7b9e0bcc3a81d3d79d76be32cf23
Parents: 1d19810
Author: Trevor a.k.a @rawkintrevo <[email protected]>
Authored: Tue Nov 28 15:37:29 2017 -0600
Committer: Trevor a.k.a @rawkintrevo <[email protected]>
Committed: Tue Nov 28 15:37:29 2017 -0600

----------------------------------------------------------------------
 .../docs/tutorials/cco-lastfm/cco-lastfm.scala | 33 ++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/mahout/blob/defbbd20/website/docs/tutorials/cco-lastfm/cco-lastfm.scala
----------------------------------------------------------------------
diff --git a/website/docs/tutorials/cco-lastfm/cco-lastfm.scala b/website/docs/tutorials/cco-lastfm/cco-lastfm.scala
index 6ba46a9..709ab2a 100644
--- a/website/docs/tutorials/cco-lastfm/cco-lastfm.scala
+++ b/website/docs/tutorials/cco-lastfm/cco-lastfm.scala
@@ -32,10 +32,39 @@ val userArtistsIDS = IndexedDatasetSpark.apply(userArtistsRDD)(sc)
 val userFriendsRDD = sc.textFile("/path/to/data/lastfm/user_friends.dat").map(line => line.split("\t")).map(a => (a(0), a(1))).filter(_._1 != "userID")
 val userFriendsIDS = IndexedDatasetSpark.apply(userFriendsRDD)(sc)
-import org.apache.mahout.math.cf.SimilarityAnalysis
+val primaryIDS = userFriendsIDS
+val secondaryActionRDDs = List(userArtistsRDD, userTagsRDD)
+
+import org.apache.mahout.math.indexeddataset.{IndexedDataset, BiDictionary}
+
+def adjustRowCardinality(rowCardinality: Integer, datasetA: IndexedDataset): IndexedDataset = {
+  val returnedA = if (rowCardinality != datasetA.matrix.nrow) datasetA.newRowCardinality(rowCardinality)
+  else datasetA // this guarantees matching cardinality
+
+  returnedA
+}
+
+var rowCardinality = primaryIDS.rowIDs.size
-val artistReccosLlrDrmListByArtist = SimilarityAnalysis.cooccurrencesIDSs(Array(userArtistsIDS, userTagsIDS, userFriendsIDS), maxInterestingItemsPerThing = 20, maxNumInteractions = 500, randomSeed = 1234)
+val secondaryActionIDS: Array[IndexedDataset] = new Array[IndexedDataset](secondaryActionRDDs.length)
+for (i <- secondaryActionRDDs.indices) {
+
+  val bcPrimaryRowIDs = sc.broadcast(primaryIDS.rowIDs)
+  bcPrimaryRowIDs.value
+
+  val tempRDD = secondaryActionRDDs(i).filter(a => bcPrimaryRowIDs.value.contains(a._1))
+
+  var tempIDS = IndexedDatasetSpark.apply(tempRDD, existingRowIDs = Some(primaryIDS.rowIDs))(sc)
+  secondaryActionIDS(i) = adjustRowCardinality(rowCardinality,tempIDS)
+}
+
+import org.apache.mahout.math.cf.SimilarityAnalysis
+val artistReccosLlrDrmListByArtist = SimilarityAnalysis.cooccurrencesIDSs(
+  Array(primaryIDS, secondaryActionIDS(0), secondaryActionIDS(1)),
+  maxInterestingItemsPerThing = 20,
+  maxNumInteractions = 500,
+  randomSeed = 1234)
 // Anonymous User
 val artistMap = sc.textFile("/path/to/lastfm/artists.dat").map(line => line.split("\t")).map(a => (a(1), a(0))).filter(_._1 != "name").collect.toMap
