Hello,
I am trying to implement an item based recommender with a custom ItemSimilarity.
I've used the movielens data for the test and the item similarity uses the
movie genre to create the similarity value.
I've followed the advice in the book and written a very simple app to see it in
action.
When I run the code, the results that I get back do not make a lot of sense.
For example, below are the recommendations I get for user 1:
1450 : 5.0 ->'1450 Prisoner of the Mountains, 1996, War'
1289 : 5.0 ->'1289 Koyaanisqatsi, 1983, Documentary War'
760 : 5.0 ->'760 Stalingrad, 1993, War'
632 : 5.0 ->'632 Land and Freedom, 1995, War'
665 : 5.0 ->'665 Underground, 1995, War'
Movies
Wacthed:53>Crime:2,Adventure:5,Action:5,War:2,Fantasy:3,Romance:6,Animation:39,Children's:20,Sci-Fi:3,Musical:14,Comedy:14,Thriller:3
In the last line of the log we see that the user has watched a lot of movies
with the genres Animation, Children's and Musical, yet the recommendations are
all from the genre War.
I've repeated the test for many different users, and all the recommendations
that I got were out of line with the user history.
Can anyone tell me what I'm doing wrong?
Here is the code:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.List;
import org.apache.mahout.cf.taste.common.Refreshable;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.recommender.RecommendedItem;
import org.apache.mahout.cf.taste.recommender.Recommender;
import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
public class MahoutTest {
/** Path of the ratings file that is loaded into the data model and scanned for the genre report. */
public static String originalDataFile = "C:\\Tmp\\ml-1m\\ratings.csv";

/** Path of a second ratings file; not read anywhere in this class. */
public static String revertedDataFile = "C:\\Tmp\\ml-1m\\ratings_reverted.csv";

/** Id of the user for whom recommendations and the genre report are produced. */
public static int PERSON_ID = 1;

/**
 * Loads the movie catalogue and the ratings, prints the top five recommendations for
 * {@link #PERSON_ID} (one per line, as {@code itemId : value ->movie}), and then prints
 * the genre histogram of the movies that user has rated.
 *
 * @param args ignored
 * @throws Exception if the data model cannot be built or the recommender fails
 */
public static void main(String[] args) throws Exception {
    MovieExtractor movies = new MovieExtractor("C:\\Tmp\\ml-1m\\movies.dat");
    movies.init();

    File ratingsFile = new File(originalDataFile);
    DataModel ratings = new FileDataModel(ratingsFile);
    RecommenderBuilder builder = new ExtendedItemBaseRecommender(movies);
    Recommender recommender = builder.buildRecommender(ratings);

    List<RecommendedItem> topItems = recommender.recommend(PERSON_ID, 5);
    for (RecommendedItem item : topItems) {
        long itemId = item.getItemID();
        StringBuilder row = new StringBuilder();
        row.append(itemId);
        row.append(" : ");
        row.append(item.getValue());
        row.append(" ->");
        // The movie dictionary is keyed by int, hence the narrowing cast.
        row.append(movies.getMovie((int) itemId));
        System.out.println(row.toString());
    }

    DataExtractor history = new DataExtractor(originalDataFile, movies);
    String genreReport = history.parseUserMovieGenres(PERSON_ID);
    System.out.println(genreReport);
}
/**
 * {@link RecommenderBuilder} that produces an item-based recommender whose item
 * similarity is the genre-based {@link MyItemSimilarity}.
 */
public static class ExtendedItemBaseRecommender implements RecommenderBuilder {

    /** Movie catalogue handed to every similarity instance this builder creates. */
    private MovieExtractor movieCatalogue;

    /**
     * @param extractor movie catalogue used to look up the genres of each item
     */
    public ExtendedItemBaseRecommender(MovieExtractor extractor) {
        movieCatalogue = extractor;
    }

    /**
     * Builds a {@link GenericItemBasedRecommender} over the given model.
     *
     * @param model ratings data the recommender works on
     * @return a new item-based recommender using genre similarity
     * @throws TasteException if the similarity cannot be created
     */
    public Recommender buildRecommender(DataModel model) throws TasteException {
        ItemSimilarity genreSimilarity = new MyItemSimilarity(movieCatalogue);
        return new GenericItemBasedRecommender(model, genreSimilarity);
    }
}
/**
 * Joins the elements of {@code list} into one string, placing {@code seperator}
 * between consecutive elements only (never after the last one).
 *
 * <p>The result is no longer trimmed: separators are only written between elements,
 * so there is no trailing separator left to strip, and whitespace that belongs to
 * the elements themselves is preserved.
 *
 * @param list      elements to join, in iteration order; must not be {@code null}
 * @param seperator text inserted between two consecutive elements
 * @return the joined string, or the empty string when {@code list} is empty
 */
public static String list2String(Collection<String> list, String seperator) {
    StringBuilder joined = new StringBuilder();
    boolean first = true;
    for (String element : list) {
        if (!first) {
            joined.append(seperator);
        }
        joined.append(element);
        first = false;
    }
    return joined.toString();
}
/**
 * {@link ItemSimilarity} that compares two movies by their genre bit masks:
 * identical masks score {@code 0.8}, masks sharing at least one genre score
 * {@code 0.6}, and everything else (including unknown movies) scores {@code 0}.
 */
public static class MyItemSimilarity implements ItemSimilarity {

    /** Similarity returned when both movies carry exactly the same genre mask. */
    private static final double SAME_GENRES_SIMILARITY = 0.8d;

    /** Similarity returned when the two genre masks share at least one bit. */
    private static final double SHARED_GENRE_SIMILARITY = 0.6d;

    /** Catalogue used to resolve item ids to movies. */
    private final MovieExtractor extractor;

    /**
     * @param extractor movie catalogue used to resolve item ids
     * @throws TasteException declared for callers; not thrown by this constructor
     */
    public MyItemSimilarity(MovieExtractor extractor) throws TasteException {
        this.extractor = extractor;
    }

    /**
     * Computes the genre-based similarity of two items.
     *
     * @param itemID1 first item id
     * @param itemID2 second item id
     * @return {@code 0.8} for identical genre masks, {@code 0.6} for overlapping
     *         masks, {@code 0} otherwise or when either id is not in the catalogue
     */
    public double itemSimilarity(long itemID1, long itemID2) {
        // The catalogue is keyed by int, hence the narrowing casts.
        final Movie movie1 = extractor.getMovie((int) itemID1);
        final Movie movie2 = extractor.getMovie((int) itemID2);
        // Unknown items are an expected case (ratings may reference ids missing from
        // the catalogue); handle them explicitly instead of catching the resulting
        // NullPointerException and printing a stack trace on every call.
        if (movie1 == null || movie2 == null) {
            return 0;
        }
        final int genres1 = movie1.getGenre();
        final int genres2 = movie2.getGenre();
        if (genres1 == genres2) {
            return SAME_GENRES_SIMILARITY;
        }
        if ((genres1 & genres2) != 0) {
            return SHARED_GENRE_SIMILARITY;
        }
        return 0;
    }

    /** No cached state to refresh. */
    public void refresh(Collection<Refreshable> arg0) {
    }

    /**
     * This implementation does not enumerate similar items.
     *
     * @param arg0 item id (unused)
     * @return an empty array (rather than {@code null}) so callers can iterate safely
     * @throws TasteException never thrown by this implementation
     */
    public long[] allSimilarItemIDs(long arg0) throws TasteException {
        return new long[0];
    }

    /**
     * Computes {@link #itemSimilarity(long, long)} of one item against many.
     *
     * @param itemID1  the reference item id
     * @param itemID2s ids to compare against
     * @return similarities in the same order as {@code itemID2s}
     * @throws TasteException never thrown by this implementation
     */
    public double[] itemSimilarities(long itemID1, long[] itemID2s)
            throws TasteException {
        double[] result = new double[itemID2s.length];
        for (int i = 0; i < result.length; i++) {
            result[i] = itemSimilarity(itemID1, itemID2s[i]);
        }
        return result;
    }
}
/**
 * Immutable value holder for one movie: id, title, release year and a bit mask
 * of genre flags (see the {@code GENRE_*} constants of {@link MovieExtractor}).
 */
public static class Movie {

    private final int id;
    private final String name;
    private final int year;
    private final int genre;

    /**
     * @param id    movie id
     * @param genre bit mask of genre flags
     * @param name  movie title
     * @param year  release year
     */
    public Movie(int id, int genre, String name, int year) {
        this.id = id;
        this.name = name;
        this.year = year;
        this.genre = genre;
    }

    /** @return the movie id */
    public int getId() {
        return id;
    }

    /** @return the movie title */
    public String getName() {
        return name;
    }

    /** @return the release year */
    public int getYear() {
        return year;
    }

    /** @return the genre bit mask */
    public int getGenre() {
        return genre;
    }

    /**
     * @return the movie rendered as {@code 'id name, year, genres'}, with the genre
     *         names produced by {@link MovieExtractor#displayGenres(int)}
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("'");
        text.append(id).append(" ").append(name);
        text.append(", ").append(year);
        text.append(", ").append(MovieExtractor.displayGenres(genre));
        text.append("'");
        return text.toString();
    }
}
/**
 * Loads a movie catalogue from a {@code "::"}-separated text file
 * ({@code id::Title (year)::Genre|Genre|...}) and exposes the movies by id.
 * Genres are encoded as a bit mask built from the {@code GENRE_*} flags.
 */
public static class MovieExtractor {

    // One bit per genre so that a movie's genres can be OR-ed into a single int.
    public static final int GENRE_Action = 1 << 0;
    public static final int GENRE_Adventure = 1 << 1;
    public static final int GENRE_Animation = 1 << 2;
    public static final int GENRE_Children = 1 << 3;
    public static final int GENRE_Comedy = 1 << 4;
    public static final int GENRE_Crime = 1 << 5;
    public static final int GENRE_Documentary = 1 << 6;
    public static final int GENRE_Drama = 1 << 7;
    public static final int GENRE_Fantasy = 1 << 8;
    public static final int GENRE_FilmNoir = 1 << 9;
    public static final int GENRE_Horror = 1 << 10;
    public static final int GENRE_Musical = 1 << 11;
    public static final int GENRE_Mystery = 1 << 12;
    public static final int GENRE_Romance = 1 << 13;
    public static final int GENRE_SciFi = 1 << 14;
    public static final int GENRE_Thriller = 1 << 15;
    public static final int GENRE_War = 1 << 16;
    public static final int GENRE_Western = 1 << 17;

    /** Path of the movie file read by {@link #init()}. */
    private final String file;

    /** Movies loaded so far, keyed by movie id. */
    private final Map<Integer, Movie> dictionary = new HashMap<Integer, Movie>();

    /**
     * @param file path of the movie file; nothing is read until {@link #init()} is called
     */
    public MovieExtractor(String file) {
        this.file = file;
    }

    /**
     * Reads the movie file line by line and fills the dictionary.
     *
     * <p>Loading is best effort: on any read or parse error the stack trace is
     * printed and loading stops, keeping the movies read up to that point. The
     * reader is always closed.
     */
    public void init() {
        try {
            BufferedReader reader = new BufferedReader(new FileReader(this.file));
            try {
                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    String[] parts = line.split("::");
                    final int id = Integer.parseInt(parts[0]);
                    final String title = parts[1];
                    // The name is everything before the first '('; the year is the
                    // content of the last parenthesised group.
                    final String name = title.substring(0, title.indexOf('(')).trim();
                    final int year = Integer.parseInt(title.substring(
                            title.lastIndexOf('(') + 1, title.lastIndexOf(')')));
                    final int genre = calculateGenre(parts[2]);
                    dictionary.put(id, new Movie(id, genre, name, year));
                }
            } finally {
                reader.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts a {@code '|'}-separated genre list into a bit mask.
     *
     * @param genre genre list such as {@code "Animation|Children's"}; may be {@code null}
     * @return OR of the flags of all recognised genres, {@code 0} if none
     */
    private int calculateGenre(String genre) {
        if (genre == null || genre.length() == 0) {
            return 0;
        }
        int result = 0;
        // Splitting a string without '|' yields the string itself, so the single
        // genre case needs no special handling.
        String[] names = genre.split("\\|");
        for (int i = 0; i < names.length; i++) {
            String name = names[i].trim();
            if (name.length() > 0) {
                result |= findGenre(name);
            }
        }
        return result;
    }

    /**
     * Maps one genre name (case-insensitive) to its flag.
     *
     * @param genre a single genre name
     * @return the matching {@code GENRE_*} flag, or {@code 0} for an unknown name
     */
    private int findGenre(String genre) {
        if (genre.equalsIgnoreCase("Action")) {
            return GENRE_Action;
        } else if (genre.equalsIgnoreCase("Adventure")) {
            return GENRE_Adventure;
        } else if (genre.equalsIgnoreCase("Animation")) {
            return GENRE_Animation;
        } else if (genre.equalsIgnoreCase("Children's")) {
            return GENRE_Children;
        } else if (genre.equalsIgnoreCase("Comedy")) {
            return GENRE_Comedy;
        } else if (genre.equalsIgnoreCase("Crime")) {
            return GENRE_Crime;
        } else if (genre.equalsIgnoreCase("Documentary")) {
            return GENRE_Documentary;
        } else if (genre.equalsIgnoreCase("Drama")) {
            // Previously returned GENRE_Animation, which tagged every drama as
            // an animation film.
            return GENRE_Drama;
        } else if (genre.equalsIgnoreCase("Fantasy")) {
            return GENRE_Fantasy;
        } else if (genre.equalsIgnoreCase("Film-Noir")) {
            return GENRE_FilmNoir;
        } else if (genre.equalsIgnoreCase("Horror")) {
            return GENRE_Horror;
        } else if (genre.equalsIgnoreCase("Musical")) {
            return GENRE_Musical;
        } else if (genre.equalsIgnoreCase("Mystery")) {
            return GENRE_Mystery;
        } else if (genre.equalsIgnoreCase("Romance")) {
            return GENRE_Romance;
        } else if (genre.equalsIgnoreCase("Sci-Fi")) {
            return GENRE_SciFi;
        } else if (genre.equalsIgnoreCase("Thriller")) {
            return GENRE_Thriller;
        } else if (genre.equalsIgnoreCase("War")) {
            return GENRE_War;
        } else if (genre.equalsIgnoreCase("Western")) {
            return GENRE_Western;
        } else {
            return 0;
        }
    }

    /**
     * @param id movie id
     * @return the movie with that id, or {@code null} if it was not loaded
     */
    public Movie getMovie(int id) {
        return dictionary.get(id);
    }

    /**
     * Expands a genre bit mask into display names.
     *
     * @param genre genre bit mask
     * @return names of all genres whose flag is set, in flag order
     */
    public static List<String> getGenresAsList(int genre) {
        final List<String> list = new ArrayList<String>();
        filterGenre(genre, "Action", GENRE_Action, list);
        filterGenre(genre, "Adventure", GENRE_Adventure, list);
        filterGenre(genre, "Animation", GENRE_Animation, list);
        filterGenre(genre, "Children's", GENRE_Children, list);
        filterGenre(genre, "Comedy", GENRE_Comedy, list);
        filterGenre(genre, "Crime", GENRE_Crime, list);
        filterGenre(genre, "Documentary", GENRE_Documentary, list);
        filterGenre(genre, "Drama", GENRE_Drama, list);
        filterGenre(genre, "Fantasy", GENRE_Fantasy, list);
        filterGenre(genre, "FilmNoir", GENRE_FilmNoir, list);
        filterGenre(genre, "Horror", GENRE_Horror, list);
        filterGenre(genre, "Musical", GENRE_Musical, list);
        filterGenre(genre, "Mystery", GENRE_Mystery, list);
        filterGenre(genre, "Romance", GENRE_Romance, list);
        filterGenre(genre, "Sci-Fi", GENRE_SciFi, list);
        filterGenre(genre, "Thriller", GENRE_Thriller, list);
        filterGenre(genre, "War", GENRE_War, list);
        filterGenre(genre, "Western", GENRE_Western, list);
        return list;
    }

    /**
     * @param genre genre bit mask
     * @return the genre names of the mask joined via {@code list2String} with a space
     */
    public static String displayGenres(int genre) {
        List<String> list = getGenresAsList(genre);
        return list2String(list, " ");
    }

    /** Adds {@code genreName} to {@code list} when {@code genreFilter} is set in {@code genre}. */
    private static void filterGenre(int genre, String genreName,
            int genreFilter, final List<String> list) {
        if ((genre & genreFilter) == genreFilter) {
            list.add(genreName);
        }
    }
}
/**
 * Scans a comma-separated ratings file ({@code userId,movieId,...}) and summarises
 * which genres a given user has rated.
 */
public static class DataExtractor {

    /** Path of the ratings file. */
    private final String file;

    /** Catalogue used to resolve movie ids to genres. */
    private final MovieExtractor movieExtractor;

    /**
     * @param file           path of the ratings file
     * @param movieExtractor catalogue used to resolve movie ids
     */
    public DataExtractor(String file, MovieExtractor movieExtractor) {
        this.file = file;
        this.movieExtractor = movieExtractor;
    }

    /**
     * Builds a one-line genre histogram for one user.
     *
     * <p>Every rating line of the user counts as one movie; each genre of that movie
     * increments the genre's counter. Ratings whose movie id is not in the catalogue
     * still count as a movie but contribute no genres. The reader is always closed.
     *
     * @param targetUserID user whose ratings are summarised
     * @return {@code "Movies Wacthed:<count>>"} followed by {@code genre:count,}
     *         for each genre seen, or the empty string if the file cannot be read
     *         or parsed (the stack trace is printed in that case)
     */
    public String parseUserMovieGenres(int targetUserID) {
        try {
            int movieCount = 0;
            Map<String, Integer> genreMap = new HashMap<String, Integer>();
            BufferedReader reader = new BufferedReader(new FileReader(this.file));
            try {
                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    String[] parts = line.split(",");
                    final int userID = Integer.parseInt(parts[0]);
                    if (userID != targetUserID) {
                        continue;
                    }
                    movieCount++;
                    final int movieID = Integer.parseInt(parts[1]);
                    final Movie movie = movieExtractor.getMovie(movieID);
                    if (movie == null) {
                        // Rated movie is missing from the catalogue: skip its genres
                        // instead of failing the whole report with an NPE.
                        continue;
                    }
                    for (String genre : MovieExtractor.getGenresAsList(movie.getGenre())) {
                        Integer previous = genreMap.get(genre);
                        genreMap.put(genre, previous == null ? 1 : previous + 1);
                    }
                }
            } finally {
                reader.close();
            }
            StringBuilder buffer = new StringBuilder();
            // Output text (including its spelling) is kept as-is for compatibility
            // with existing logs.
            buffer.append("Movies Wacthed:").append(movieCount).append(">");
            for (Map.Entry<String, Integer> entry : genreMap.entrySet()) {
                buffer.append(entry.getKey()).append(":").append(entry.getValue())
                        .append(",");
            }
            return buffer.toString();
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }
}
}
Best regards,
Uzay