Updated Branches: refs/heads/master 814cc1d39 -> 8e249d30e
Modularize dataset downloads, configurable iterations, and comments to all tests for more maintainable mahout tests. Also updates the url for movie lens data set as per original goal of this JIRA. Project: http://git-wip-us.apache.org/repos/asf/bigtop/repo Commit: http://git-wip-us.apache.org/repos/asf/bigtop/commit/8e249d30 Tree: http://git-wip-us.apache.org/repos/asf/bigtop/tree/8e249d30 Diff: http://git-wip-us.apache.org/repos/asf/bigtop/diff/8e249d30 Branch: refs/heads/master Commit: 8e249d30ee2587a7d73896dbf0ce79a09419da51 Parents: 814cc1d Author: jayunit100 <[email protected]> Authored: Thu Nov 14 14:11:52 2013 -0500 Committer: Bruno Mahé <[email protected]> Committed: Sat Nov 16 13:51:29 2013 -0800 ---------------------------------------------------------------------- .../mahout/smoke/TestMahoutExamples.groovy | 132 ++++++++++++++----- 1 file changed, 97 insertions(+), 35 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/bigtop/blob/8e249d30/bigtop-tests/test-artifacts/mahout/src/main/groovy/org/apache/bigtop/itest/mahout/smoke/TestMahoutExamples.groovy ---------------------------------------------------------------------- diff --git a/bigtop-tests/test-artifacts/mahout/src/main/groovy/org/apache/bigtop/itest/mahout/smoke/TestMahoutExamples.groovy b/bigtop-tests/test-artifacts/mahout/src/main/groovy/org/apache/bigtop/itest/mahout/smoke/TestMahoutExamples.groovy index 66b614f..9e50350 100644 --- a/bigtop-tests/test-artifacts/mahout/src/main/groovy/org/apache/bigtop/itest/mahout/smoke/TestMahoutExamples.groovy +++ b/bigtop-tests/test-artifacts/mahout/src/main/groovy/org/apache/bigtop/itest/mahout/smoke/TestMahoutExamples.groovy @@ -16,7 +16,7 @@ * limitations under the License. 
*/ package org.apache.bigtop.itest.mahout.smoke; - +import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import org.junit.AfterClass; @@ -44,48 +44,75 @@ public class TestMahoutExamples { private static Shell sh = new Shell("/bin/bash -s"); public static String download_dir = System.getProperty("mahout.examples.resources.download.path") ?: "/tmp" ; + + /** + * Mahout smokes rely on a lot of external files. So we + * modularize the downloads into a single function, so that + * the setup is easier to debug. If any download results in a + * small file (i.e. due to 404 or 500 error), assertion will fail + * before the smokes actually start. + */ + public static void download(){ + + //key value pairs : data file -> url that file resides on. + def urlmap = [ + "20news-bydate.tar.gz": + "http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz" , + + "reuters21578.tar.gz": + "http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz", + + "synthetic_control.data": + "http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data", + + "ml-1m.zip": + "http://files.grouplens.org/papers/ml-1m.zip" + ]; + //For each url above, download it. + urlmap.each() { + f_name,loc -> + sh.exec("if [ ! -f ${download_dir}/${f_name} ]; then " + + "curl ${loc} -o ${download_dir}/${f_name}; " + + "fi"); + File file = new File("${download_dir}/${f_name}"); + + assertTrue("file "+ f_name + " at "+loc + " len=" + file.length() + " is > 5k bytes", file.length() > 5000 ); + } + + } + + /** + * Individual tests (i.e. movie lens factorizer) will selectively copy this directory into the + * distributed file system & then run tests against it (i.e. movie lens factorizer uses "fs -put" after + * formatting a csv file in the tmp dir). + */ @BeforeClass public static void setUp() { - // download resources - sh.exec( - "if [ ! 
-f ${download_dir}/20news-bydate.tar.gz ]; then " + - "curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${download_dir}/20news-bydate.tar.gz; " + - "fi"); - sh.exec( - "if [ ! -f ${download_dir}/reuters21578.tar.gz ]; then " + - "curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz -o ${download_dir}/reuters21578.tar.gz; " + - "fi"); - sh.exec( - "if [ ! -f ${download_dir}/synthetic_control.data ]; then " + - "curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data -o ${download_dir}/synthetic_control.data; " + - "fi"); - sh.exec( - "if [ ! -f ${download_dir}/ml-1m.zip ]; then " + - "curl http://www.grouplens.org/system/files/ml-1m.zip -o ${download_dir}/ml-1m.zip; " + - "fi"); + download(); + // uncompress archives - // 20news-bydate.tar.gz - // reuters21578.tar.gz - // ml-1m.zip sh.exec("mkdir ${TEMP_DIR}", "cd ${TEMP_DIR}", - "mkdir 20news-bydate", + //Create news-date data dir :: input for classifier test + "mkdir 20news-bydate", "cd 20news-bydate", - "tar xzf ${download_dir}/20news-bydate.tar.gz", + "tar xzf ${download_dir}/20news-bydate.tar.gz", "cd ..", + //Create news-all data directory :: input for LDA test "mkdir 20news-all", "cp -R 20news-bydate/*/* 20news-all", "mkdir reuters-sgm", "cd reuters-sgm", "tar xzf ${download_dir}/reuters21578.tar.gz", "cd ..", + //Create movie lens data directory :: input data for movie recommender test "mkdir movielens", "cd movielens", "unzip ${download_dir}/ml-1m.zip"); assertEquals("Failed to uncompress archives", 0, sh.getRet()); sh.exec("hadoop fs -mkdir ${WORK_DIR}"); - assertEquals("Unable to create work dir in hdfs", 0, sh.getRet()); + assertEquals("Unable to create work dir in HCFS", 0, sh.getRet()); rmr("temp"); } @@ -94,14 +121,18 @@ public class TestMahoutExamples { */ public void assertRun(String mahoutJob){ final String cmd = MAHOUT+" "+mahoutJob; - sh.exec(cmd); - assertEquals("Failed to run: "+cmd, 0, sh.getRet()); + + //Cat the 
commands to a central file that's easy to tail. + //TODO a simpler + sh.exec("echo \""+cmd+"\" >> /var/log/mahout.smoke"); + sh.exec(cmd); + assertEquals("non-zero return! :::: "+cmd + " :::: out= " + sh.out + " :::: err= "+sh.err, 0, sh.getRet()); } @AfterClass public static void tearDown() { - sh.exec("rm -rf ${TEMP_DIR}", - "hadoop fs -rmr ${WORK_DIR}"); + sh.exec("rm -rf ${TEMP_DIR}", + "hadoop fs -rmr ${WORK_DIR}"); } private static void rmr(String path) { @@ -122,6 +153,16 @@ public class TestMahoutExamples { } } + //iterations for factorizer, original value was "10", + //on a small 4 node cluster, 2 iterations + //should complete in about 5 minutes or so. + static final int ITERATIONS=2; + + /** + * This is the full workflow for creating recommendations based on movie + * ratings including creating training/test data, ALS for training, evaluating + * the ALS, and then outputting final movie recommendations for users. + */ @Test(timeout=12000000L) public void factorizeMovieLensRatings() { // convert ratings @@ -137,21 +178,26 @@ public class TestMahoutExamples { assertRun("splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset " + "--trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp"); - //run distributed ALS-WR to factorize the rating matrix based on the training set + //Default iterations was 10, but for simple smokes that most might run, + //2 iterations will confirm enough to move on. 
+ + //run distributed ALS-WR to factorize the rating matrix based on the training set + assertRun("parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out " + - "--tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065"); + "--tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations ${ITERATIONS} --lambda 0.065"); + //remove this + sh.exec("hadoop fs -ls ${WORK_DIR}/als/out >> /tmp/mahoutdebug"); //compute predictions against the probe set, measure the error assertRun("evaluateFactorization --output ${WORK_DIR}/als/rmse --input ${WORK_DIR}/dataset/probeSet/ " + "--userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp"); - + //compute recommendations assertRun("recommendfactorized --input ${WORK_DIR}/als/out/userRatings/ --output ${WORK_DIR}/recommendations " + "--userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ " + "--numRecommendations 6 --maxRating 5"); // check that error has been calculated - sh.exec("hadoop fs -test -e ${WORK_DIR}/als/rmse/rmse.txt"); assertEquals("${WORK_DIR}/als/rmse/rmse.txt does not exist", 0, sh.getRet()); // print the error sh.exec("hadoop fs -cat ${WORK_DIR}/als/rmse/rmse.txt"); @@ -162,8 +208,18 @@ public class TestMahoutExamples { assertEquals("${WORK_DIR}/recommendations/part-m-00000 does not exist", 0, sh.getRet()); } - // it's too much of a pain to use junit parameterized tests, so do it - // the simple way + /** + * Alternative to parameterized test: this is a test that is implemented by each + * individual clustering test. + * + * Explanation of clustering tests: + * + * Each of the below tests runs a different clustering algorithm against the same + * input data set, against synthesize "control" data. "Control data" is data that shows + * the time series performance of a process. 
For example, a cellphone company + * might want to run this to find which regions have decreasing performance over time (e.g. due to increased population), + * versus places which have cyclic performance (e.g. due to weather). + */ private void _clusterSyntheticControlData(String algorithm) { rmr("testdata"); sh.exec("hadoop fs -mkdir testdata", @@ -198,6 +254,9 @@ public class TestMahoutExamples { _clusterSyntheticControlData("meanshift"); } + /** + * Test the creation of topical clusters from raw lists of words using LDA. + */ @Test(timeout=7200000L) public void testReutersLDA() { // where does lda.algorithm come in? @@ -233,7 +292,10 @@ mahout ldatopics \ -dt sequencefile"""); } - @Test(timeout=9000000L) + /** + * Note that this test doesn't work on some older mahout versions. + */ + @Test(timeout=9000000L) public void testBayesNewsgroupClassifier() { // put bayes-train-input and bayes-test-input in hdfs sh.exec("hadoop fs -mkdir ${WORK_DIR}/20news-vectors");
