Repository: bigtop
Updated Branches:
  refs/heads/master b4a7a7a47 -> 3b13a811a


BIGTOP-1287. Mahout smokes: Remove dirichlet/meanshift clustering.


Project: http://git-wip-us.apache.org/repos/asf/bigtop/repo
Commit: http://git-wip-us.apache.org/repos/asf/bigtop/commit/3b13a811
Tree: http://git-wip-us.apache.org/repos/asf/bigtop/tree/3b13a811
Diff: http://git-wip-us.apache.org/repos/asf/bigtop/diff/3b13a811

Branch: refs/heads/master
Commit: 3b13a811a8c6ad568a6a3a7b586be9f2fdf4e810
Parents: b4a7a7a
Author: jayunit100 <[email protected]>
Authored: Mon Apr 21 08:56:57 2014 -0400
Committer: jayunit100 <[email protected]>
Committed: Thu Jan 15 21:35:08 2015 -0500

----------------------------------------------------------------------
 .../mahout/smoke/TestMahoutExamples.groovy      | 268 +++++++++----------
 1 file changed, 129 insertions(+), 139 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/bigtop/blob/3b13a811/bigtop-tests/test-artifacts/mahout/src/main/groovy/org/apache/bigtop/itest/mahout/smoke/TestMahoutExamples.groovy
----------------------------------------------------------------------
diff --git a/bigtop-tests/test-artifacts/mahout/src/main/groovy/org/apache/bigtop/itest/mahout/smoke/TestMahoutExamples.groovy b/bigtop-tests/test-artifacts/mahout/src/main/groovy/org/apache/bigtop/itest/mahout/smoke/TestMahoutExamples.groovy
index 9e50350..63f07be 100644
--- a/bigtop-tests/test-artifacts/mahout/src/main/groovy/org/apache/bigtop/itest/mahout/smoke/TestMahoutExamples.groovy
+++ b/bigtop-tests/test-artifacts/mahout/src/main/groovy/org/apache/bigtop/itest/mahout/smoke/TestMahoutExamples.groovy
@@ -1,20 +1,20 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+* <p>
+* http://www.apache.org/licenses/LICENSE-2.0
+* <p>
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
 package org.apache.bigtop.itest.mahout.smoke;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.assertEquals;
@@ -28,17 +28,17 @@ import org.apache.bigtop.itest.JarContent;
 import org.apache.bigtop.itest.shell.Shell;
 
 /**
- * Test Mahout examples shipped with the distribution.
- */
+* Test Mahout examples shipped with the distribution.
+*/
 public class TestMahoutExamples {
     public static final String TEMP_DIR = "/tmp/mahout.${(new Date().getTime())}";
     public static final String WORK_DIR = TEMP_DIR;
 
     /**
-     * If MAHOUT_HOME is supplied, use that as the executable.  Else, use
-     * mahout.  This eases the testing of tarball installations and other scenarios
-     * where possible more than one version of an ecosystem component is available.
-     */
+    * If MAHOUT_HOME is supplied, use that as the executable.  Else, use
+    * mahout.  This eases the testing of tarball installations and other scenarios
+    * where possible more than one version of an ecosystem component is available.
+    */
     public static String MAHOUT_HOME = System.getenv("MAHOUT_HOME") ;
     public static String MAHOUT = MAHOUT_HOME ? MAHOUT_HOME+"/bin/mahout":"mahout"
 
@@ -48,68 +48,68 @@ public class TestMahoutExamples {
 
     /**
     *  Mahout smokes rely on a lot of external files.  So we
-    *  modularize the downloads into a single function, so that 
-    *  the setup is easier to debug.  If any download results in a 
+    *  modularize the downloads into a single function, so that
+    *  the setup is easier to debug.  If any download results in a
     *  small file (i.e. due to 404 or 500 error), assertion will fail
-    *  before the smokes actually start. 
+    *  before the smokes actually start.
     */
     public static void download(){
 
-               //key value pairs : data file -> url that file resides on.
-        def urlmap = [
-                        "20news-bydate.tar.gz":
-                                                       "http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz" ,
-                        
-                                               "reuters21578.tar.gz":
-                                                       "http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz",
-                        
-                                               "synthetic_control.data":
-                                                       "http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data",
-                        
-                                               "ml-1m.zip":
-                                                       "http://files.grouplens.org/papers/ml-1m.zip"
-                        ];
-        //For each url above, download it.
+        //key value pairs : data file -> url that file resides on.
+        def urlmap = [
+        "20news-bydate.tar.gz":
+        "http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz" ,
+
+        "reuters21578.tar.gz":
+        "http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz",
+
+        "synthetic_control.data":
+        "http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data",
+
+        "ml-1m.zip":
+        "http://files.grouplens.org/papers/ml-1m.zip"
+        ];
+        //For each url above, download it.
         urlmap.each() {
-                f_name,loc ->
-                        sh.exec("if [ ! -f ${download_dir}/${f_name} ]; then " +
-                                "curl ${loc} -o ${download_dir}/${f_name}; " +
-                                "fi");
-                File file = new File("${download_dir}/${f_name}");
-                
-                assertTrue("file "+ f_name + " at  "+loc + " len=" + file.length() + " is > 5k bytes", file.length() > 5000 );
+            f_name,loc ->
+            sh.exec("if [ ! -f ${download_dir}/${f_name} ]; then " +
+            "curl ${loc} -o ${download_dir}/${f_name}; " +
+            "fi");
+            File file = new File("${download_dir}/${f_name}");
+
+            assertTrue("file "+ f_name + " at  "+loc + " len=" + file.length() + " is > 5k bytes", file.length() > 5000 );
         }
 
     }
 
-       /**
-        * Individual tests (i.e. movie lens factorizer) will selectively copy this directory into the
-        * distributed file system & then run tests against it (i.e. movie lens factorizer uses "fs -put" after
-        * formatting a csv file in the tmp dir).
-        */
+    /**
+    * Individual tests (i.e. movie lens factorizer) will selectively copy this directory into the
+    * distributed file system & then run tests against it (i.e. movie lens factorizer uses "fs -put" after
+    * formatting a csv file in the tmp dir).
+    */
     @BeforeClass
     public static void setUp() {
-        download(); 
+        download();
         
         // uncompress archives
         sh.exec("mkdir ${TEMP_DIR}",
-                "cd ${TEMP_DIR}",
+        "cd ${TEMP_DIR}",
         //Create news-date data dir :: input for classifier test
-               "mkdir 20news-bydate",
-                "cd 20news-bydate",
-               "tar xzf ${download_dir}/20news-bydate.tar.gz",
-                "cd ..",
-               //Create news-all data directory :: input for LDA test
-                "mkdir 20news-all",
-                "cp -R 20news-bydate/*/* 20news-all",
-                "mkdir reuters-sgm",
-                "cd reuters-sgm",
-                "tar xzf ${download_dir}/reuters21578.tar.gz",
-                "cd ..",
-               //Create movie lens data directory :: input data for movie recommender test
-                "mkdir movielens",
-                "cd movielens",
-                "unzip ${download_dir}/ml-1m.zip");
+        "mkdir 20news-bydate",
+        "cd 20news-bydate",
+        "tar xzf ${download_dir}/20news-bydate.tar.gz",
+        "cd ..",
+        //Create news-all data directory :: input for LDA test
+        "mkdir 20news-all",
+        "cp -R 20news-bydate/*/* 20news-all",
+        "mkdir reuters-sgm",
+        "cd reuters-sgm",
+        "tar xzf ${download_dir}/reuters21578.tar.gz",
+        "cd ..",
+        //Create movie lens data directory :: input data for movie recommender test
+        "mkdir movielens",
+        "cd movielens",
+        "unzip ${download_dir}/ml-1m.zip");
         assertEquals("Failed to uncompress archives", 0, sh.getRet());
         sh.exec("hadoop fs -mkdir ${WORK_DIR}");
         assertEquals("Unable to create work dir in HCFS", 0, sh.getRet());
@@ -117,22 +117,22 @@ public class TestMahoutExamples {
     }
 
     /**
-     * Run method that tests for 0 return code and logs the entire command.
-     */
+    * Run method that tests for 0 return code and logs the entire command.
+    */
     public void assertRun(String mahoutJob){
         final String cmd = MAHOUT+" "+mahoutJob;
 
-               //Cat the commands to a central file thats easy to tail.  
-               //TODO a simpler 
-               sh.exec("echo \""+cmd+"\" >> /var/log/mahout.smoke");
-               sh.exec(cmd);
+        //Cat the commands to a central file thats easy to tail.
+        //TODO a simpler
+        sh.exec("echo \""+cmd+"\" >> /var/log/mahout.smoke");
+        sh.exec(cmd);
         assertEquals("non-zero return! :::: "+cmd + " :::: out= " + sh.out + " :::: err= "+sh.err, 0, sh.getRet());
     }
 
     @AfterClass
     public static void tearDown() {
-               sh.exec("rm -rf ${TEMP_DIR}",
-                               "hadoop fs -rmr ${WORK_DIR}");
+        sh.exec("rm -rf ${TEMP_DIR}",
+        "hadoop fs -rmr ${WORK_DIR}");
     }
 
     private static void rmr(String path) {
@@ -148,54 +148,54 @@ public class TestMahoutExamples {
         sh.exec("mapred job -list | grep 'Total jobs:0'");
         if (sh.getRet() == 0) {
             sh.exec("for jobid in `mapred job -list | grep 'RUNNING' |awk '{print \$1}'`;",
-                    "do mapred job -kill \${jobid};",
-                    "done");
+            "do mapred job -kill \${jobid};",
+            "done");
         }
     }
 
-       //iterations for factorizer, original value was "10",
-       //on a small 4 node cluster, 2 iterations 
-       //should complete in about 5 minutes or so.
-    static final int ITERATIONS=2;  
-       
-       /**
-        * This is the full workflow for creating recommendations based on movie
-        * ratings including creating training/test data, ALS for training, evaluating
-        * the ALS, and then outputting final movie recommendations for users.
-        */
+    //iterations for factorizer, original value was "10",
+    //on a small 4 node cluster, 2 iterations
+    //should complete in about 5 minutes or so.
+    static final int ITERATIONS=2;
+
+    /**
+    * This is the full workflow for creating recommendations based on movie
+    * ratings including creating training/test data, ALS for training, evaluating
+    * the ALS, and then outputting final movie recommendations for users.
+    */
     @Test(timeout=12000000L)
     public void factorizeMovieLensRatings() {
         // convert ratings
-        sh.exec("cat ${TEMP_DIR}/movielens/ml-1m/ratings.dat |sed -e s/::/,/g| cut -d, -f1,2,3 > ${TEMP_DIR}/movielens/ratings.csv");
+        sh.exec("cat ${TEMP_DIR}/movielens/ml-1m/ratings.dat |sed -e s/::/,/g| cut -d, -f1,2,3 > ${TEMP_DIR}/movielens/ratings.csv");
         assertEquals("Unexpected error from converting ratings", 0, sh.getRet());
 
         // put ratings in hdfs
         sh.exec("hadoop fs -mkdir ${WORK_DIR}/movielens",
-                "hadoop fs -put ${TEMP_DIR}/movielens/ratings.csv ${WORK_DIR}/movielens/ratings.csv");
+        "hadoop fs -put ${TEMP_DIR}/movielens/ratings.csv ${WORK_DIR}/movielens/ratings.csv");
         assertEquals("Unable to put movielens/ratings.csv in hdfs", 0, sh.getRet());
 
         //create a 90% percent training set and a 10% probe set
         assertRun("splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset " +
-                "--trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp");
+        "--trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp");
 
-       //Default iterations was 10, but for simple smokes that most might run,
-       //2 iterations will confirm enough to move on. 
+        //Default iterations was 10, but for simple smokes that most might run,
+        //2 iterations will confirm enough to move on.
+
+        //run distributed ALS-WR to factorize the rating matrix based on the training set
         
-       //run distributed ALS-WR to factorize the rating matrix based on the training set
-
         assertRun("parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out " +
-                "--tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations ${ITERATIONS} --lambda 0.065");
+        "--tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations ${ITERATIONS} --lambda 0.065");
 
-       //remove this
-        sh.exec("hadoop fs -ls ${WORK_DIR}/als/out >> /tmp/mahoutdebug");
+        //remove this
+        sh.exec("hadoop fs -ls ${WORK_DIR}/als/out >> /tmp/mahoutdebug");
         //compute predictions against the probe set, measure the error
         assertRun("evaluateFactorization --output ${WORK_DIR}/als/rmse --input ${WORK_DIR}/dataset/probeSet/ " +
-                "--userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp");
-
+        "--userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp");
+
         //compute recommendations
         assertRun("recommendfactorized --input ${WORK_DIR}/als/out/userRatings/ --output ${WORK_DIR}/recommendations " +
-                "--userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ " +
-                "--numRecommendations 6 --maxRating 5");
+        "--userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ " +
+        "--numRecommendations 6 --maxRating 5");
 
         // check that error has been calculated
         assertEquals("${WORK_DIR}/als/rmse/rmse.txt does not exist", 0, sh.getRet());
@@ -208,22 +208,22 @@ public class TestMahoutExamples {
         assertEquals("${WORK_DIR}/recommendations/part-m-00000 does not exist", 0, sh.getRet());
     }
 
-       /**
-        * Alternative to parameterized test: this is a test that is implemented by each
-        * individual clustering test.
-        *
-        * Explanation of clustering tests:
-        *
-        * Each of the below tests runs a different clustering algorithm against the same
-        * input data set, against synthesize "control" data.  "Control data" is data that shows
-        * the time series performance of a process.  For example, a cellphone company
-        * might want to run this to find which regions have decreasing performance over time (i.e. due to increased population),
-        * versus places which have cyclic performance (i.e. due to weather).
-        */
+    /**
+    * Alternative to parameterized test: this is a test that is implemented by each
+    * individual clustering test.
+    *
+    * Explanation of clustering tests:
+    *
+    * Each of the below tests runs a different clustering algorithm against the same
+    * input data set, against synthesize "control" data.  "Control data" is data that shows
+    * the time series performance of a process.  For example, a cellphone company
+    * might want to run this to find which regions have decreasing performance over time (i.e. due to increased population),
+    * versus places which have cyclic performance (i.e. due to weather).
+    */
     private void _clusterSyntheticControlData(String algorithm) {
         rmr("testdata");
         sh.exec("hadoop fs -mkdir testdata",
-                "hadoop fs -put ${download_dir}/synthetic_control.data testdata");
+        "hadoop fs -put ${download_dir}/synthetic_control.data testdata");
         assertEquals("Unable to put data in hdfs", 0, sh.getRet());
         assertRun("org.apache.mahout.clustering.syntheticcontrol.${algorithm}.Job");
         assertEquals("Unexpected error from running mahout", 0, sh.getRet());
@@ -244,19 +244,9 @@ public class TestMahoutExamples {
         _clusterSyntheticControlData("fuzzykmeans");
     }
 
-    @Test(timeout=900000L)
-    public void clusterControlDataWithDirichlet() {
-        _clusterSyntheticControlData("dirichlet");
-    }
-
-    @Test(timeout=900000L)
-    public void clusterControlDataWithMeanShift() {
-        _clusterSyntheticControlData("meanshift");
-    }
-
-       /**
-        * Test the creation of topical clusters from raw lists words using LDA.
-        */
+    /**
+    * Test the creation of topical clusters from raw lists words using LDA.
+    */
     @Test(timeout=7200000L)
     public void testReutersLDA() {
         // where does lda.algorithm come in?
@@ -268,12 +258,12 @@ public class TestMahoutExamples {
         assertRun("seqdirectory -i ${TEMP_DIR}/reuters-out -o ${TEMP_DIR}/reuters-out-seqdir -c UTF-8 -chunk 5");
         assertEquals("Unexpected error from running mahout", 0, sh.getRet());
         /*
-         // reuters-out-seqdir exists on a local disk at this point,
-         // copy it to hdfs
-         rmr("${WORK_DIR}/reuters-out-seqdir");
-         sh.exec("hadoop fs -put ${TEMP_DIR}/reuters-out-seqdir ${WORK_DIR}/reuters-out-seqdir");
-         assertEquals("Unable to put reuters-out-seqdir in hdfs", 0, sh.getRet());
-         */
+        // reuters-out-seqdir exists on a local disk at this point,
+        // copy it to hdfs
+        rmr("${WORK_DIR}/reuters-out-seqdir");
+        sh.exec("hadoop fs -put ${TEMP_DIR}/reuters-out-seqdir ${WORK_DIR}/reuters-out-seqdir");
+        assertEquals("Unable to put reuters-out-seqdir in hdfs", 0, sh.getRet());
+        */
         assertRun("""seq2sparse \
 -i ${WORK_DIR}/reuters-out-seqdir/ \
 -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda \
@@ -292,10 +282,10 @@ mahout ldatopics \
 -dt sequencefile""");
     }
 
-       /**
-        * Note that this test doesnt work on some older mahout versions.  
-        */
-       @Test(timeout=9000000L)
+    /**
+    * Note that this test doesnt work on some older mahout versions.
+    */
+    @Test(timeout=9000000L)
     public void testBayesNewsgroupClassifier() {
         // put bayes-train-input and bayes-test-input in hdfs
         sh.exec("hadoop fs -mkdir ${WORK_DIR}/20news-vectors");

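For anyone reproducing these smokes by hand: every test above follows one pattern, shelling out through Bigtop's Shell helper and asserting on the exit code. Below is a minimal, self-contained Groovy sketch of that pattern. It assumes bigtop-test-framework and JUnit are on the classpath and a mahout launcher is installed; the "/bin/bash -s" constructor argument and the seqdirectory probe command are illustrative assumptions, not lines from this commit.

    // Sketch of the Shell-and-assert pattern used throughout TestMahoutExamples.
    // Assumption: Shell("/bin/bash -s") is the usual Bigtop construction; the
    // probe command below is illustrative only.
    import org.apache.bigtop.itest.shell.Shell
    import static org.junit.Assert.assertEquals

    def sh = new Shell("/bin/bash -s")
    // Same MAHOUT_HOME fallback the test class uses:
    def mahout = System.getenv("MAHOUT_HOME") ?
            System.getenv("MAHOUT_HOME") + "/bin/mahout" : "mahout"

    sh.exec(mahout + " seqdirectory --help")
    assertEquals("mahout failed: out=" + sh.out + " err=" + sh.err, 0, sh.getRet())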