But RecommenderJob seems to call RowSimilarityJob first. That is where sampling needs to be done.
  //calculate the co-occurrence matrix
  ToolRunner.run(getConf(), new RowSimilarityJob(), new String[]{
    "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(),
    "--output", similarityMatrixPath.toString(),
    "--numberOfColumns", String.valueOf(numberOfUsers),
    "--similarityClassname", similarityClassname,
    "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem),
    "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
    "--threshold", String.valueOf(threshold),
    "--tempDir", getTempPath().toString(),
  });

  // write out the similarity matrix if the user specified that behavior
  if (hasOption("outputPathForSimilarityMatrix")) {
    Path outputPathForSimilarityMatrix = new Path(getOption("outputPathForSimilarityMatrix"));

    Job outputSimilarityMatrix = prepareJob(similarityMatrixPath, outputPathForSimilarityMatrix,
        SequenceFileInputFormat.class, ItemSimilarityJob.MostSimilarItemPairsMapper.class,
        EntityEntityWritable.class, DoubleWritable.class, ItemSimilarityJob.MostSimilarItemPairsReducer.class,
        EntityEntityWritable.class, DoubleWritable.class, TextOutputFormat.class);

    Configuration mostSimilarItemsConf = outputSimilarityMatrix.getConfiguration();
    mostSimilarItemsConf.set(ItemSimilarityJob.ITEM_ID_INDEX_PATH_STR,
        new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
    mostSimilarItemsConf.setInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, maxSimilaritiesPerItem);
    outputSimilarityMatrix.waitForCompletion(true);
  }
}

On Tue, Jun 18, 2013 at 10:47 PM, Sean Owen <sro...@gmail.com> wrote:

> No, it's in ItemSimilarityJob -- I'm looking at it now. It ends up
> setting ToItemVectorsMapper.SAMPLE_SIZE, if that helps.
>
> On Tue, Jun 18, 2013 at 9:43 PM, Ted Dunning <ted.dunn...@gmail.com>
> wrote:
> > Ahh... only effective in RecommenderJob.
>