Added: websites/staging/mahout/trunk/content/users/mapreduce/classification/breiman-example.html ============================================================================== --- websites/staging/mahout/trunk/content/users/mapreduce/classification/breiman-example.html (added) +++ websites/staging/mahout/trunk/content/users/mapreduce/classification/breiman-example.html Thu Mar 19 21:21:45 2015 @@ -0,0 +1,331 @@ +<!DOCTYPE html> +<!-- + + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> + <title>Apache Mahout: Scalable machine learning and data mining</title> + <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> + <meta name="Distribution" content="Global"> + <meta name="Robots" content="index,follow"> + <meta name="keywords" content="apache, apache hadoop, apache lucene, + business data mining, cluster analysis, + collaborative filtering, data extraction, data filtering, data framework, data integration, + data matching, data mining, data mining algorithms, data mining analysis, data mining data, + data mining introduction, data mining software, + data mining techniques, data representation, data set, datamining, + feature extraction, fuzzy k means, genetic algorithm, hadoop, + hierarchical clustering, high dimensional, introduction to data mining, kmeans, + knowledge discovery, learning approach, learning approaches, learning methods, + learning techniques, lucene, machine learning, machine translation, mahout apache, + mahout taste, map reduce hadoop, mining data, mining methods, naive bayes, + natural language processing, + supervised, text mining, time series data, unsupervised, web data mining"> + <link rel="shortcut icon" type="image/x-icon" href="http://mahout.apache.org/images/favicon.ico"> + <script type="text/javascript" src="/js/prototype.js"></script> + <script type="text/javascript" src="/js/effects.js"></script> + <script type="text/javascript" src="/js/search.js"></script> + <script type="text/javascript" src="/js/slides.js"></script> + + <link href="/css/bootstrap.min.css" rel="stylesheet" media="screen"> + <link href="/css/bootstrap-responsive.css" rel="stylesheet"> + <link rel="stylesheet" href="/css/global.css" type="text/css"> + + <!-- mathJax stuff -- use `\(...\)` for inline style math in markdown --> + <script type="text/x-mathjax-config"> + MathJax.Hub.Config({ 
+ tex2jax: { + skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] + } + }); + MathJax.Hub.Queue(function() { + var all = MathJax.Hub.getAllJax(), i; + for(i = 0; i < all.length; i += 1) { + all[i].SourceElement().parentNode.className += ' has-jax'; + } + }); + </script> + <script type="text/javascript"> + var mathjax = document.createElement('script'); + mathjax.type = 'text/javascript'; + mathjax.async = true; + + mathjax.src = ('https:' == document.location.protocol) ? + 'https://c328740.ssl.cf1.rackcdn.com/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' : + 'http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; + + var s = document.getElementsByTagName('script')[0]; + s.parentNode.insertBefore(mathjax, s); + </script> +</head> + +<body id="home" data-twttr-rendered="true"> + <div id="wrap"> + <div id="header"> + <div id="logo"><a href="/overview.html"></a></div> + <div id="search"> + <form id="search-form" action="http://www.google.com/search" method="get" class="navbar-search pull-right"> + <input value="http://mahout.apache.org" name="sitesearch" type="hidden"> + <input class="search-query" name="q" id="query" type="text"> + <input id="submission" type="image" src="/images/mahout-lupe.png" alt="Search" /> + </form> + </div> + + <div class="navbar navbar-inverse" style="position:absolute;top:133px;padding-right:0px;padding-left:0px;"> + <div class="navbar-inner" style="border: none; background: #999; border: none; border-radius: 0px;"> + <div class="container"> + <button type="button" class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse"> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> + <!-- <a class="brand" href="#">Apache Community Development Project</a> --> + <div class="nav-collapse collapse"> + <ul class="nav"> + <li><a href="/">Home</a></li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">General<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/general/downloads.html">Downloads</a> + <li><a href="/general/who-we-are.html">Who we are</a> + <li><a href="/general/mailing-lists,-irc-and-archives.html">Mailing Lists</a> + <li><a href="/general/release-notes.html">Release Notes</a> + <li><a href="/general/books-tutorials-and-talks.html">Books, Tutorials, Talks</a></li> + <li><a href="/general/powered-by-mahout.html">Powered By Mahout</a> + <li><a href="/general/professional-support.html">Professional Support</a> + <li class="divider"></li> + <li class="nav-header">Resources</li> + <li><a href="/general/reference-reading.html">Reference Reading</a> + <li><a href="/general/faq.html">FAQ</a> + <li class="divider"></li> + <li class="nav-header">Legal</li> + <li><a href="http://www.apache.org/licenses/">License</a></li> + <li><a href="http://www.apache.org/security/">Security</a></li> + <li><a href="/general/privacy-policy.html">Privacy Policy</a> + </ul> + </li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Developers<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/developers/developer-resources.html">Developer resources</a></li> + <li><a href="/developers/version-control.html">Version control</a></li> + <li><a href="/developers/buildingmahout.html">Build from source</a></li> + <li><a href="/developers/issue-tracker.html">Issue tracker</a></li> + <li><a href="https://builds.apache.org/job/Mahout-Quality/" target="_blank">Code quality reports</a></li> + <li class="divider"></li> + <li class="nav-header">Contributions</li> + <li><a href="/developers/how-to-contribute.html">How to contribute</a></li> + <li><a href="/developers/how-to-become-a-committer.html">How to become a committer</a></li> + <li><a href="/developers/gsoc.html">GSoC</a></li> + <li class="divider"></li> + <li class="nav-header">For committers</li> + <li><a 
href="/developers/how-to-update-the-website.html">How to update the website</a></li> + <li><a href="/developers/patch-check-list.html">Patch check list</a></li> + <li><a href="/developers/github.html">Handling Github PRs</a></li> + <li><a href="/developers/how-to-release.html">How to release</a></li> + <li><a href="/developers/thirdparty-dependencies.html">Third party dependencies</a></li> + </ul> + </li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Basics<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/users/basics/algorithms.html">List of algorithms</a> + <li><a href="/users/basics/quickstart.html">Quickstart</a> + <li class="divider"></li> + <li class="nav-header">Working with text</li> + <li><a href="/users/basics/creating-vectors-from-text.html">Creating vectors from text</a> + <li><a href="/users/basics/collocations.html">Collocations</a> + <li class="divider"></li> + <li class="nav-header">Dimensionality reduction</li> + <li><a href="/users/dim-reduction/dimensional-reduction.html">Singular Value Decomposition</a></li> + <li><a href="/users/dim-reduction/ssvd.html">Stochastic SVD</a></li> + <li class="divider"></li> + <li class="nav-header">Topic Models</li> + <li><a href="/users/clustering/latent-dirichlet-allocation.html">Latent Dirichlet Allocation</a></li> + </ul> + </li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Spark<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/users/sparkbindings/home.html">Scala & Spark Bindings Overview</a></li> + <li><a href="/users/sparkbindings/play-with-shell.html">Playing with Mahout's Spark Shell</a></li> + <li class="divider"></li> + <li><a href="/users/sparkbindings/faq.html">FAQ</a></li> + </ul> + </li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Classification<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a 
href="/users/mapreduce/classification/bayesian.html">Naive Bayes</a></li> + <li><a href="/users/mapreduce/classification/hidden-markov-models.html">Hidden Markov Models</a></li> + <li><a href="/users/mapreduce/classification/logistic-regression.html">Logistic Regression</a></li> + <li><a href="/users/mapreduce/classification/partial-implementation.html">Random Forest</a></li> + + <li class="divider"></li> + <li class="nav-header">Examples</li> + <li><a href="/users/mapreduce/classification/breiman-example.html">Breiman example</a></li> + <li><a href="/users/mapreduce/classification/twenty-newsgroups.html">20 newsgroups example</a></li> + </ul></li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Clustering<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/users/mapreduce/clustering/k-means-clustering.html">k-Means</a></li> + <li><a href="/users/mapreduce/clustering/canopy-clustering.html">Canopy</a></li> + <li><a href="/users/mapreduce/clustering/fuzzy-k-means.html">Fuzzy k-Means</a></li> + <li><a href="/users/mapreduce/clustering/streaming-k-means.html">Streaming KMeans</a></li> + <li><a href="/users/mapreduce/clustering/spectral-clustering.html">Spectral Clustering</a></li> + <li class="divider"></li> + <li class="nav-header">Commandline usage</li> + <li><a href="/users/mapreduce/clustering/k-means-commandline.html">Options for k-Means</a></li> + <li><a href="/users/mapreduce/clustering/canopy-commandline.html">Options for Canopy</a></li> + <li><a href="/users/mapreduce/clustering/fuzzy-k-means-commandline.html">Options for Fuzzy k-Means</a></li> + <li class="divider"></li> + <li class="nav-header">Examples</li> + <li><a href="/users/mapreduce/clustering/clustering-of-synthetic-control-data.html">Synthetic data</a></li> + <li class="divider"></li> + <li class="nav-header">Post processing</li> + <li><a href="/users/mapreduce/clustering/cluster-dumper.html">Cluster Dumper tool</a></li> + <li><a 
href="/users/mapreduce/clustering/visualizing-sample-clusters.html">Cluster visualisation</a></li> + </ul></li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Recommendations<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/users/mapreduce/recommender/quickstart.html">Quickstart</a></li> + <li><a href="/users/mapreduce/recommender/recommender-first-timer-faq.html">First Timer FAQ</a></li> + <li><a href="/users/mapreduce/recommender/userbased-5-minutes.html">A user-based recommender <br/>in 5 minutes</a></li> + <li><a href="/users/mapreduce/recommender/matrix-factorization.html">Matrix factorization-based<br/> recommenders</a></li> + <li><a href="/users/mapreduce/recommender/recommender-documentation.html">Overview</a></li> + <li class="divider"></li> + <li class="nav-header">Hadoop</li> + <li><a href="/users/mapreduce/recommender/intro-itembased-hadoop.html">Intro to item-based recommendations<br/> with Hadoop</a></li> + <li><a href="/users/mapreduce/recommender/intro-als-hadoop.html">Intro to ALS recommendations<br/> with Hadoop</a></li> + <li class="nav-header">Spark</li> + <li><a href="/users/mapreduce/recommender/intro-cooccurrence-spark.html">Intro to cooccurrence-based<br/> recommendations with Spark</a></li> + </ul> + </li> + </ul> + </div><!--/.nav-collapse --> + </div> + </div> + </div> + +</div> + + <div id="sidebar"> + <div id="sidebar-wrap"> + <h2>Twitter</h2> + <ul class="sidemenu"> + <li> +<a class="twitter-timeline" href="https://twitter.com/ApacheMahout" data-widget-id="422861673444028416">Tweets by @ApacheMahout</a> +<script>!function(d,s,id){var js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+"://platform.twitter.com/widgets.js";fjs.parentNode.insertBefore(js,fjs);}}(document,"script","twitter-wjs");</script> +</li> + </ul> + <h2>Apache Software Foundation</h2> + <ul class="sidemenu"> + <li><a 
href="http://www.apache.org/foundation/how-it-works.html">How the ASF works</a></li> + <li><a href="http://www.apache.org/foundation/getinvolved.html">Get Involved</a></li> + <li><a href="http://www.apache.org/dev/">Developer Resources</a></li> + <li><a href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li> + <li><a href="http://www.apache.org/foundation/thanks.html">Thanks</a></li> + </ul> + <h2>Related Projects</h2> + <ul class="sidemenu"> + <li><a href="http://lucene.apache.org/">Lucene</a></li> + <li><a href="http://hadoop.apache.org/">Hadoop</a></li> + </ul> + </div> +</div> + + <div id="content-wrap" class="clearfix"> + <div id="main"> + <h1 id="breiman-example">Breiman Example</h1> +<h4 id="introduction">Introduction</h4> +<p>This page describes how to run the Breiman example, which implements the test procedure described in <a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.23.3999&rep=rep1&type=pdf">Leo Breiman's paper</a>. The basic algorithm is as follows :</p> +<ul> +<li>repeat <em>I</em> iterations</li> +<li>in each iteration do</li> +<li>keep 10% of the dataset apart as a testing set </li> +<li>build two forests using the training set, one with <em>m = int(log2(M) + 1)</em> (called Random-Input) and one with <em>m = 1</em> (called Single-Input)</li> +<li>choose the forest that gave the lowest oob error estimation to compute +the test set error</li> +<li>compute the test set error using the Single Input Forest (test error), +this demonstrates that even with <em>m = 1</em>, Decision Forests give comparable +results to greater values of <em>m</em></li> +<li>compute the mean testset error using every tree of the chosen forest +(tree error). 
This should indicate how well a single Decision Tree performs</li> +<li>compute the mean test error for all iterations</li> +<li>compute the mean tree error for all iterations</li> +</ul> +<h4 id="running-the-example">Running the Example</h4> +<p>The current implementation is compatible with the <a href="http://archive.ics.uci.edu/ml/">UCI repository</a> file format. We'll show how to run this example on two datasets:</p> +<p>First, we deal with <a href="http://archive.ics.uci.edu/ml/datasets/Glass+Identification">Glass Identification</a>: download the <a href="http://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data">dataset</a> file called <strong>glass.data</strong> and store it on your local machine. Next, we must generate the descriptor file <strong>glass.info</strong> for this dataset with the following command:</p> +<div class="codehilite"><pre><span class="n">bin</span><span class="o">/</span><span class="n">mahout</span> <span class="n">org</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">mahout</span><span class="p">.</span><span class="n">classifier</span><span class="p">.</span><span class="n">df</span><span class="p">.</span><span class="n">tools</span><span class="p">.</span><span class="n">Describe</span> <span class="o">-</span><span class="n">p</span> <span class="o">/</span><span class="n">path</span><span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">glass</span><span class="p">.</span><span class="n">data</span> <span class="o">-</span><span class="n">f</span> <span class="o">/</span><span class="n">path</span><span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">glass</span><span class="p">.</span><span class="n">info</span> <span class="o">-</span><span class="n">d</span> <span class="n">I</span> 9 <span class="n">N</span> <span class="n">L</span> +</pre></div> + + +<p>Substitute <em>/path/to/</em> with 
the folder where you downloaded the dataset. The argument "I 9 N L" indicates the nature of the variables. Here it means 1 +ignored (I) attribute, followed by 9 numerical (N) attributes, followed by +the label (L).</p> +<p>Finally, we build and evaluate our random forest classifier as follows:</p> +<div class="codehilite"><pre><span class="n">bin</span><span class="o">/</span><span class="n">mahout</span> <span class="n">org</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">mahout</span><span class="p">.</span><span class="n">classifier</span><span class="p">.</span><span class="n">df</span><span class="p">.</span><span class="n">BreimanExample</span> <span class="o">-</span><span class="n">d</span> <span class="o">/</span><span class="n">path</span><span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">glass</span><span class="p">.</span><span class="n">data</span> <span class="o">-</span><span class="n">ds</span> <span class="o">/</span><span class="n">path</span><span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">glass</span><span class="p">.</span><span class="n">info</span> <span class="o">-</span><span class="nb">i</span> 10 <span class="o">-</span><span class="n">t</span> 100 +</pre></div> + + +<p>This builds 100 trees (-t argument) and repeats the test for 10 iterations (-i +argument).</p> +<p>The example outputs the following results:</p> +<ul> +<li>Selection error: mean test error for the selected forest on all iterations</li> +<li>Single Input error: mean test error for the single input forest on all +iterations</li> +<li>One Tree error: mean single tree error on all iterations</li> +<li>Mean Random Input Time: mean build time for random input forests on all +iterations</li> +<li>Mean Single Input Time: mean build time for single input forests on all +iterations</li> +</ul> +<p>We can repeat this for a <a 
href="http://archive.ics.uci.edu/ml/datasets/Connectionist+Bench+%28Sonar,+Mines+vs.+Rocks%29">Sonar</a> use case: download the <a href="http://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data">dataset</a> file called <strong>sonar.all-data</strong> and store it on your local machine. Generate the descriptor file <strong>sonar.info</strong> for this dataset with the following command:</p> +<div class="codehilite"><pre><span class="n">bin</span><span class="o">/</span><span class="n">mahout</span> <span class="n">org</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">mahout</span><span class="p">.</span><span class="n">classifier</span><span class="p">.</span><span class="n">df</span><span class="p">.</span><span class="n">tools</span><span class="p">.</span><span class="n">Describe</span> <span class="o">-</span><span class="n">p</span> <span class="o">/</span><span class="n">path</span><span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">sonar</span><span class="p">.</span><span class="n">all</span><span class="o">-</span><span class="n">data</span> <span class="o">-</span><span class="n">f</span> <span class="o">/</span><span class="n">path</span><span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">sonar</span><span class="p">.</span><span class="n">info</span> <span class="o">-</span><span class="n">d</span> 60 <span class="n">N</span> <span class="n">L</span> +</pre></div> + + +<p>The argument "60 N L" means 60 numerical (N) attributes, followed by the label (L). 
Analogous to the previous case, we run the evaluation as follows:</p> +<div class="codehilite"><pre><span class="n">bin</span><span class="o">/</span><span class="n">mahout</span> <span class="n">org</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">mahout</span><span class="p">.</span><span class="n">classifier</span><span class="p">.</span><span class="n">df</span><span class="p">.</span><span class="n">BreimanExample</span> <span class="o">-</span><span class="n">d</span> <span class="o">/</span><span class="n">path</span><span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">sonar</span><span class="p">.</span><span class="n">all</span><span class="o">-</span><span class="n">data</span> <span class="o">-</span><span class="n">ds</span> <span class="o">/</span><span class="n">path</span><span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">sonar</span><span class="p">.</span><span class="n">info</span> <span class="o">-</span><span class="nb">i</span> 10 <span class="o">-</span><span class="n">t</span> 100 +</pre></div> + </div> + </div> +</div> + <footer class="footer" align="center"> + <div class="container"> + <p> + Copyright © 2014 The Apache Software Foundation, Licensed under + the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. + <br /> + Apache and the Apache feather logos are trademarks of The Apache Software Foundation. + </p> + </div> + </footer> + + <script src="/js/jquery-1.9.1.min.js"></script> + <script src="/js/bootstrap.min.js"></script> + <script> + (function() { + var cx = '012254517474945470291:vhsfv7eokdc'; + var gcse = document.createElement('script'); + gcse.type = 'text/javascript'; + gcse.async = true; + gcse.src = (document.location.protocol == 'https:' ? 
'https:' : 'http:') + + '//www.google.com/cse/cse.js?cx=' + cx; + var s = document.getElementsByTagName('script')[0]; + s.parentNode.insertBefore(gcse, s); + })(); + </script> +</body> +</html>
Added: websites/staging/mahout/trunk/content/users/mapreduce/classification/class-discovery.html ============================================================================== --- websites/staging/mahout/trunk/content/users/mapreduce/classification/class-discovery.html (added) +++ websites/staging/mahout/trunk/content/users/mapreduce/classification/class-discovery.html Thu Mar 19 21:21:45 2015 @@ -0,0 +1,413 @@ +<!DOCTYPE html> +<!-- + + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> + <title>Apache Mahout: Scalable machine learning and data mining</title> + <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> + <meta name="Distribution" content="Global"> + <meta name="Robots" content="index,follow"> + <meta name="keywords" content="apache, apache hadoop, apache lucene, + business data mining, cluster analysis, + collaborative filtering, data extraction, data filtering, data framework, data integration, + data matching, data mining, data mining algorithms, data mining analysis, data mining data, + data mining introduction, data mining software, + data mining techniques, data representation, data set, datamining, + feature extraction, fuzzy k means, genetic algorithm, hadoop, + hierarchical clustering, high dimensional, introduction to data mining, kmeans, + knowledge discovery, learning approach, learning approaches, learning methods, + learning techniques, lucene, machine learning, machine translation, mahout apache, + mahout taste, map reduce hadoop, mining data, mining methods, naive bayes, + natural language processing, + supervised, text mining, time series data, unsupervised, web data mining"> + <link rel="shortcut icon" type="image/x-icon" href="http://mahout.apache.org/images/favicon.ico"> + <script type="text/javascript" src="/js/prototype.js"></script> + <script type="text/javascript" src="/js/effects.js"></script> + <script type="text/javascript" src="/js/search.js"></script> + <script type="text/javascript" src="/js/slides.js"></script> + + <link href="/css/bootstrap.min.css" rel="stylesheet" media="screen"> + <link href="/css/bootstrap-responsive.css" rel="stylesheet"> + <link rel="stylesheet" href="/css/global.css" type="text/css"> + + <!-- mathJax stuff -- use `\(...\)` for inline style math in markdown --> + <script type="text/x-mathjax-config"> + MathJax.Hub.Config({ 
+ tex2jax: { + skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] + } + }); + MathJax.Hub.Queue(function() { + var all = MathJax.Hub.getAllJax(), i; + for(i = 0; i < all.length; i += 1) { + all[i].SourceElement().parentNode.className += ' has-jax'; + } + }); + </script> + <script type="text/javascript"> + var mathjax = document.createElement('script'); + mathjax.type = 'text/javascript'; + mathjax.async = true; + + mathjax.src = ('https:' == document.location.protocol) ? + 'https://c328740.ssl.cf1.rackcdn.com/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' : + 'http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; + + var s = document.getElementsByTagName('script')[0]; + s.parentNode.insertBefore(mathjax, s); + </script> +</head> + +<body id="home" data-twttr-rendered="true"> + <div id="wrap"> + <div id="header"> + <div id="logo"><a href="/overview.html"></a></div> + <div id="search"> + <form id="search-form" action="http://www.google.com/search" method="get" class="navbar-search pull-right"> + <input value="http://mahout.apache.org" name="sitesearch" type="hidden"> + <input class="search-query" name="q" id="query" type="text"> + <input id="submission" type="image" src="/images/mahout-lupe.png" alt="Search" /> + </form> + </div> + + <div class="navbar navbar-inverse" style="position:absolute;top:133px;padding-right:0px;padding-left:0px;"> + <div class="navbar-inner" style="border: none; background: #999; border: none; border-radius: 0px;"> + <div class="container"> + <button type="button" class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse"> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> + <!-- <a class="brand" href="#">Apache Community Development Project</a> --> + <div class="nav-collapse collapse"> + <ul class="nav"> + <li><a href="/">Home</a></li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">General<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/general/downloads.html">Downloads</a> + <li><a href="/general/who-we-are.html">Who we are</a> + <li><a href="/general/mailing-lists,-irc-and-archives.html">Mailing Lists</a> + <li><a href="/general/release-notes.html">Release Notes</a> + <li><a href="/general/books-tutorials-and-talks.html">Books, Tutorials, Talks</a></li> + <li><a href="/general/powered-by-mahout.html">Powered By Mahout</a> + <li><a href="/general/professional-support.html">Professional Support</a> + <li class="divider"></li> + <li class="nav-header">Resources</li> + <li><a href="/general/reference-reading.html">Reference Reading</a> + <li><a href="/general/faq.html">FAQ</a> + <li class="divider"></li> + <li class="nav-header">Legal</li> + <li><a href="http://www.apache.org/licenses/">License</a></li> + <li><a href="http://www.apache.org/security/">Security</a></li> + <li><a href="/general/privacy-policy.html">Privacy Policy</a> + </ul> + </li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Developers<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/developers/developer-resources.html">Developer resources</a></li> + <li><a href="/developers/version-control.html">Version control</a></li> + <li><a href="/developers/buildingmahout.html">Build from source</a></li> + <li><a href="/developers/issue-tracker.html">Issue tracker</a></li> + <li><a href="https://builds.apache.org/job/Mahout-Quality/" target="_blank">Code quality reports</a></li> + <li class="divider"></li> + <li class="nav-header">Contributions</li> + <li><a href="/developers/how-to-contribute.html">How to contribute</a></li> + <li><a href="/developers/how-to-become-a-committer.html">How to become a committer</a></li> + <li><a href="/developers/gsoc.html">GSoC</a></li> + <li class="divider"></li> + <li class="nav-header">For committers</li> + <li><a 
href="/developers/how-to-update-the-website.html">How to update the website</a></li> + <li><a href="/developers/patch-check-list.html">Patch check list</a></li> + <li><a href="/developers/github.html">Handling Github PRs</a></li> + <li><a href="/developers/how-to-release.html">How to release</a></li> + <li><a href="/developers/thirdparty-dependencies.html">Third party dependencies</a></li> + </ul> + </li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Basics<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/users/basics/algorithms.html">List of algorithms</a> + <li><a href="/users/basics/quickstart.html">Quickstart</a> + <li class="divider"></li> + <li class="nav-header">Working with text</li> + <li><a href="/users/basics/creating-vectors-from-text.html">Creating vectors from text</a> + <li><a href="/users/basics/collocations.html">Collocations</a> + <li class="divider"></li> + <li class="nav-header">Dimensionality reduction</li> + <li><a href="/users/dim-reduction/dimensional-reduction.html">Singular Value Decomposition</a></li> + <li><a href="/users/dim-reduction/ssvd.html">Stochastic SVD</a></li> + <li class="divider"></li> + <li class="nav-header">Topic Models</li> + <li><a href="/users/clustering/latent-dirichlet-allocation.html">Latent Dirichlet Allocation</a></li> + </ul> + </li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Spark<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/users/sparkbindings/home.html">Scala & Spark Bindings Overview</a></li> + <li><a href="/users/sparkbindings/play-with-shell.html">Playing with Mahout's Spark Shell</a></li> + <li class="divider"></li> + <li><a href="/users/sparkbindings/faq.html">FAQ</a></li> + </ul> + </li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Classification<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a 
href="/users/mapreduce/classification/bayesian.html">Naive Bayes</a></li> + <li><a href="/users/mapreduce/classification/hidden-markov-models.html">Hidden Markov Models</a></li> + <li><a href="/users/mapreduce/classification/logistic-regression.html">Logistic Regression</a></li> + <li><a href="/users/mapreduce/classification/partial-implementation.html">Random Forest</a></li> + + <li class="divider"></li> + <li class="nav-header">Examples</li> + <li><a href="/users/mapreduce/classification/breiman-example.html">Breiman example</a></li> + <li><a href="/users/mapreduce/classification/twenty-newsgroups.html">20 newsgroups example</a></li> + </ul></li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Clustering<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/users/mapreduce/clustering/k-means-clustering.html">k-Means</a></li> + <li><a href="/users/mapreduce/clustering/canopy-clustering.html">Canopy</a></li> + <li><a href="/users/mapreduce/clustering/fuzzy-k-means.html">Fuzzy k-Means</a></li> + <li><a href="/users/mapreduce/clustering/streaming-k-means.html">Streaming KMeans</a></li> + <li><a href="/users/mapreduce/clustering/spectral-clustering.html">Spectral Clustering</a></li> + <li class="divider"></li> + <li class="nav-header">Commandline usage</li> + <li><a href="/users/mapreduce/clustering/k-means-commandline.html">Options for k-Means</a></li> + <li><a href="/users/mapreduce/clustering/canopy-commandline.html">Options for Canopy</a></li> + <li><a href="/users/mapreduce/clustering/fuzzy-k-means-commandline.html">Options for Fuzzy k-Means</a></li> + <li class="divider"></li> + <li class="nav-header">Examples</li> + <li><a href="/users/mapreduce/clustering/clustering-of-synthetic-control-data.html">Synthetic data</a></li> + <li class="divider"></li> + <li class="nav-header">Post processing</li> + <li><a href="/users/mapreduce/clustering/cluster-dumper.html">Cluster Dumper tool</a></li> + <li><a 
href="/users/mapreduce/clustering/visualizing-sample-clusters.html">Cluster visualisation</a></li> + </ul></li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Recommendations<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/users/mapreduce/recommender/quickstart.html">Quickstart</a></li> + <li><a href="/users/mapreduce/recommender/recommender-first-timer-faq.html">First Timer FAQ</a></li> + <li><a href="/users/mapreduce/recommender/userbased-5-minutes.html">A user-based recommender <br/>in 5 minutes</a></li> + <li><a href="/users/mapreduce/recommender/matrix-factorization.html">Matrix factorization-based<br/> recommenders</a></li> + <li><a href="/users/mapreduce/recommender/recommender-documentation.html">Overview</a></li> + <li class="divider"></li> + <li class="nav-header">Hadoop</li> + <li><a href="/users/mapreduce/recommender/intro-itembased-hadoop.html">Intro to item-based recommendations<br/> with Hadoop</a></li> + <li><a href="/users/mapreduce/recommender/intro-als-hadoop.html">Intro to ALS recommendations<br/> with Hadoop</a></li> + <li class="nav-header">Spark</li> + <li><a href="/users/mapreduce/recommender/intro-cooccurrence-spark.html">Intro to cooccurrence-based<br/> recommendations with Spark</a></li> + </ul> + </li> + </ul> + </div><!--/.nav-collapse --> + </div> + </div> + </div> + +</div> + + <div id="sidebar"> + <div id="sidebar-wrap"> + <h2>Twitter</h2> + <ul class="sidemenu"> + <li> +<a class="twitter-timeline" href="https://twitter.com/ApacheMahout" data-widget-id="422861673444028416">Tweets by @ApacheMahout</a> +<script>!function(d,s,id){var js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+"://platform.twitter.com/widgets.js";fjs.parentNode.insertBefore(js,fjs);}}(document,"script","twitter-wjs");</script> +</li> + </ul> + <h2>Apache Software Foundation</h2> + <ul class="sidemenu"> + <li><a 
href="http://www.apache.org/foundation/how-it-works.html">How the ASF works</a></li> + <li><a href="http://www.apache.org/foundation/getinvolved.html">Get Involved</a></li> + <li><a href="http://www.apache.org/dev/">Developer Resources</a></li> + <li><a href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li> + <li><a href="http://www.apache.org/foundation/thanks.html">Thanks</a></li> + </ul> + <h2>Related Projects</h2> + <ul class="sidemenu"> + <li><a href="http://lucene.apache.org/">Lucene</a></li> + <li><a href="http://hadoop.apache.org/">Hadoop</a></li> + </ul> + </div> +</div> + + <div id="content-wrap" class="clearfix"> + <div id="main"> + <p><a name="ClassDiscovery-ClassDiscovery"></a></p> +<h1 id="class-discovery">Class Discovery</h1> +<p>See http://www.cs.bham.ac.uk/~wbl/biblio/gecco1999/GP-417.pdf</p> +<p>CDGA uses a Genetic Algorithm to discover a classification rule for a given +dataset. +A dataset can be seen as a table:</p> +<table> +<tr><th> </th><th>attribute 1</th><th>attribute 2</th><th>...</th><th>attribute N</th></tr> +<tr><td>row 1</td><td>value1</td><td>value2</td><td>...</td><td>valueN</td></tr> +<tr><td>row 2</td><td>value1</td><td>value2</td><td>...</td><td>valueN</td></tr> +<tr><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td></tr> +<tr><td>row M</td><td>value1</td><td>value2</td><td>...</td><td>valueN</td></tr> +</table> + +<p>An attribute can be numerical, for example a "temperature" attribute, or +categorical, for example a "color" attribute. For classification purposes, +one of the categorical attributes is designated as a <em>label</em>, which means +that its value defines the <em>class</em> of the rows. 
+A classification rule can be represented as follows: +<table> +<tr><th> </th><th>attribute 1</th><th>attribute 2</th><th>...</th><th>attribute N</th></tr> +<tr><td>weight</td><td>w1</td><td>w2</td><td>...</td><td>wN</td></tr> +<tr><td>operator</td><td>op1</td><td>op2</td><td>...</td><td>opN</td></tr> +<tr><td>value</td><td>value1</td><td>value2</td><td>...</td><td>valueN</td></tr> +</table></p> +<p>For a given <em>target</em> class and a weight <em>threshold</em>, the classification +rule can be read as follows:</p> +<div class="codehilite"><pre><span class="k">for</span> <span class="n">each</span> <span class="n">row</span> <span class="n">of</span> <span class="n">the</span> <span class="n">dataset</span> + <span class="k">if</span> <span class="p">(</span><span class="n">rule</span><span class="p">.</span><span class="n">w1</span> <span class="o">&lt;</span> <span class="n">threshold</span> <span class="o">||</span> <span class="p">(</span><span class="n">rule</span><span class="p">.</span><span class="n">w1</span> <span class="o">&gt;</span><span class="p">=</span> <span class="n">threshold</span> <span class="o">&amp;&amp;</span> <span class="n">row</span><span class="p">.</span><span class="n">value1</span> <span class="n">rule</span><span class="p">.</span><span class="n">op1</span> rule.value1)) &amp;&amp; +     (rule.w2 &lt; threshold || (rule.w2 &gt;= threshold &amp;&amp; row.value2 rule.op2 rule.value2)) &amp;&amp; +     ... +     (rule.wN &lt; threshold || (rule.wN &gt;= threshold &amp;&amp; row.valueN rule.opN rule.valueN)) then +   row is part of the target class +</pre></div> + + +<p><em>Important:</em> The label attribute is not evaluated by the rule.</p> +<p>The threshold parameter allows some conditions of the rule to be skipped if +their weight is too small. 
The operators available depend on the attribute +types:</p> +<ul> +<li>for numerical attributes, the available operators are '&lt;' and '&gt;='</li> +<li>for categorical attributes, the available operators are '!=' and '=='</li> +</ul> +<p>The "threshold" and "target" are user-defined parameters. Because the +label is always a categorical attribute, the target is the (zero-based) +index of the class label value among all the possible values of the label. For +example, if the label attribute can have the following values (blue, brown, +green), then a target of 1 means the "brown" class.</p> +<p>For example, we have the following dataset (the label attribute is "Eyes +Color"): +<table> +<tr><th> </th><th>Age</th><th>Eyes Color</th><th>Hair Color</th></tr> +<tr><td>row 1</td><td>16</td><td>brown</td><td>dark</td></tr> +<tr><td>row 2</td><td>25</td><td>green</td><td>light</td></tr> +<tr><td>row 3</td><td>12</td><td>blue</td><td>light</td></tr> +and a classification rule: +<tr><td>weight</td><td>0</td><td>1</td></tr> +<tr><td>operator</td><td>&lt;</td><td>!=</td></tr> +<tr><td>value</td><td>20</td><td>light</td></tr> +and the following parameters: threshold = 1 and target = 0 (brown). 
+</table></p> +<p>This rule can be read as follows:</p> +<div class="codehilite"><pre><span class="k">for</span> <span class="n">each</span> <span class="n">row</span> <span class="n">of</span> <span class="n">the</span> <span class="n">dataset</span> + <span class="k">if</span> <span class="p">(</span>0 <span class="o"><</span> 1 <span class="o">||</span> <span class="p">(</span>0 <span class="o">></span><span class="p">=</span> 1 <span class="o">&&</span> <span class="n">row</span><span class="p">.</span><span class="n">value1</span> <span class="o"><</span> 20<span class="p">))</span> <span class="o">&&</span> + <span class="p">(</span>1 <span class="o"><</span> 1 <span class="o">||</span> <span class="p">(</span>1 <span class="o">></span><span class="p">=</span> 1 <span class="o">&&</span> <span class="n">row</span><span class="p">.</span><span class="n">value2</span> !<span class="p">=</span> <span class="n">light</span><span class="p">))</span> <span class="n">then</span> + <span class="n">row</span> <span class="n">is</span> <span class="n">part</span> <span class="n">of</span> <span class="n">the</span> "<span class="n">brown</span> <span class="n">Eye</span> <span class="n">Color</span>" <span class="n">class</span> +</pre></div> + + +<p>Please note how the rule skipped the label attribute (Eye Color), and how +the first condition is ignored because its weight is < threshold.</p> +<p><a name="ClassDiscovery-Runningtheexample:"></a></p> +<h1 id="running-the-example">Running the example:</h1> +<p>NOTE: Substitute in the appropriate version for the Mahout JOB jar</p> +<ol> +<li>cd <MAHOUT_HOME>/examples</li> +<li>ant job</li> +<li>{code}<HADOOP_HOME>/bin/hadoop dfs -put +<MAHOUT_HOME>/examples/src/test/resources/wdbc wdbc{code}</li> +<li>{code}<HADOOP_HOME>/bin/hadoop dfs -put +<MAHOUT_HOME>/examples/src/test/resources/wdbc.infos wdbc.infos{code}</li> +<li> +<p>{code}<HADOOP_HOME>/bin/hadoop jar +<MAHOUT_HOME>/examples/build/apache-mahout-examples-0.1-dev.job 
+org.apache.mahout.ga.watchmaker.cd.CDGA +&lt;MAHOUT_HOME&gt;/examples/src/test/resources/wdbc 1 0.9 1 0.033 0.1 0 100 10</p> +<p>CDGA needs 9 parameters:</p> +<ul> +<li>param 1 : path of the directory that contains the dataset and its infos +file</li> +<li>param 2 : target class</li> +<li>param 3 : threshold</li> +<li>param 4 : number of crossover points for the multi-point crossover</li> +<li>param 5 : mutation rate</li> +<li>param 6 : mutation range</li> +<li>param 7 : mutation precision</li> +<li>param 8 : population size</li> +<li>param 9 : number of generations before the program stops</li> +</ul> +<p>For more information about the 4th parameter, please see <a href="http://www.geatbx.com/docu/algindex-03.html#P616_36571">Multi-point Crossover</a>. +For a detailed explanation of the 5th, 6th and 7th parameters, please +see <a href="http://www.geatbx.com/docu/algindex-04.html#P659_42386">Real Valued Mutation</a>.</p> +<p><em>TODO</em>: Fill in where to find the output and what it means.</p> +<h1 id="the-info-file">The info file:</h1> +<p>To run properly, CDGA needs some information about the dataset. Each +dataset should be accompanied by an .infos file that contains the needed +information. For each attribute, a corresponding line in the infos file +describes it; it can be one of the following:</p> +<ul> +<li>IGNORED + if the attribute is ignored</li> +<li>LABEL, val1, val2,... + if the attribute is the label (class), and its possible values</li> +<li>CATEGORICAL, val1, val2,... 
+ if the attribute is categorical (nominal), and its possible values</li> +<li>NUMERICAL, min, max + if the attribute is numerical, and its min and max values</li> +</ul> +<p>This file can be generated automatically using a special tool available with +CDGA.</p> +</li> +<li> +<p>the tool searches for an existing infos file (<em>must be filled by the +user</em>), in the same directory as the dataset, with the same name and with +the ".infos" extension, that contains the type of the attributes:</p> +<ul> +<li>'N' numerical attribute</li> +<li>'C' categorical attribute</li> +<li>'L' label (this is also a categorical attribute)</li> +<li>'I' to ignore the attribute</li> +</ul> +<p>each attribute is on a separate line</p> +</li> +<li>A Hadoop job is used to parse the dataset and collect the information. +This means that <em>the dataset can be distributed over HDFS</em>.</li> +<li>the results are written back in the same .infos file, with the correct +format needed by CDGA.</li> +</ol> + </div> + </div> +</div> + <footer class="footer" align="center"> + <div class="container"> + <p> + Copyright © 2014 The Apache Software Foundation, Licensed under + the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. + <br /> + Apache and the Apache feather logos are trademarks of The Apache Software Foundation. + </p> + </div> + </footer> + + <script src="/js/jquery-1.9.1.min.js"></script> + <script src="/js/bootstrap.min.js"></script> + <script> + (function() { + var cx = '012254517474945470291:vhsfv7eokdc'; + var gcse = document.createElement('script'); + gcse.type = 'text/javascript'; + gcse.async = true; + gcse.src = (document.location.protocol == 'https:' ? 
'https:' : 'http:') + + '//www.google.com/cse/cse.js?cx=' + cx; + var s = document.getElementsByTagName('script')[0]; + s.parentNode.insertBefore(gcse, s); + })(); + </script> +</body> +</html> Added: websites/staging/mahout/trunk/content/users/mapreduce/classification/classifyingyourdata.html ============================================================================== --- websites/staging/mahout/trunk/content/users/mapreduce/classification/classifyingyourdata.html (added) +++ websites/staging/mahout/trunk/content/users/mapreduce/classification/classifyingyourdata.html Thu Mar 19 21:21:45 2015 @@ -0,0 +1,293 @@ +<!DOCTYPE html> +<!-- + + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> + <title>Apache Mahout: Scalable machine learning and data mining</title> + <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> + <meta name="Distribution" content="Global"> + <meta name="Robots" content="index,follow"> + <meta name="keywords" content="apache, apache hadoop, apache lucene, + business data mining, cluster analysis, + collaborative filtering, data extraction, data filtering, data framework, data integration, + data matching, data mining, data mining algorithms, data mining analysis, data mining data, + data mining introduction, data mining software, + data mining techniques, data representation, data set, datamining, + feature extraction, fuzzy k means, genetic algorithm, hadoop, + hierarchical clustering, high dimensional, introduction to data mining, kmeans, + knowledge discovery, learning approach, learning approaches, learning methods, + learning techniques, lucene, machine learning, machine translation, mahout apache, + mahout taste, map reduce hadoop, mining data, mining methods, naive bayes, + natural language processing, + supervised, text mining, time series data, unsupervised, web data mining"> + <link rel="shortcut icon" type="image/x-icon" href="http://mahout.apache.org/images/favicon.ico"> + <script type="text/javascript" src="/js/prototype.js"></script> + <script type="text/javascript" src="/js/effects.js"></script> + <script type="text/javascript" src="/js/search.js"></script> + <script type="text/javascript" src="/js/slides.js"></script> + + <link href="/css/bootstrap.min.css" rel="stylesheet" media="screen"> + <link href="/css/bootstrap-responsive.css" rel="stylesheet"> + <link rel="stylesheet" href="/css/global.css" type="text/css"> + + <!-- mathJax stuff -- use `\(...\)` for inline style math in markdown --> + <script type="text/x-mathjax-config"> + MathJax.Hub.Config({ 
+ tex2jax: { + skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] + } + }); + MathJax.Hub.Queue(function() { + var all = MathJax.Hub.getAllJax(), i; + for(i = 0; i < all.length; i += 1) { + all[i].SourceElement().parentNode.className += ' has-jax'; + } + }); + </script> + <script type="text/javascript"> + var mathjax = document.createElement('script'); + mathjax.type = 'text/javascript'; + mathjax.async = true; + + mathjax.src = ('https:' == document.location.protocol) ? + 'https://c328740.ssl.cf1.rackcdn.com/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' : + 'http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; + + var s = document.getElementsByTagName('script')[0]; + s.parentNode.insertBefore(mathjax, s); + </script> +</head> + +<body id="home" data-twttr-rendered="true"> + <div id="wrap"> + <div id="header"> + <div id="logo"><a href="/overview.html"></a></div> + <div id="search"> + <form id="search-form" action="http://www.google.com/search" method="get" class="navbar-search pull-right"> + <input value="http://mahout.apache.org" name="sitesearch" type="hidden"> + <input class="search-query" name="q" id="query" type="text"> + <input id="submission" type="image" src="/images/mahout-lupe.png" alt="Search" /> + </form> + </div> + + <div class="navbar navbar-inverse" style="position:absolute;top:133px;padding-right:0px;padding-left:0px;"> + <div class="navbar-inner" style="border: none; background: #999; border: none; border-radius: 0px;"> + <div class="container"> + <button type="button" class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse"> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> + <!-- <a class="brand" href="#">Apache Community Development Project</a> --> + <div class="nav-collapse collapse"> + <ul class="nav"> + <li><a href="/">Home</a></li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">General<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/general/downloads.html">Downloads</a> + <li><a href="/general/who-we-are.html">Who we are</a> + <li><a href="/general/mailing-lists,-irc-and-archives.html">Mailing Lists</a> + <li><a href="/general/release-notes.html">Release Notes</a> + <li><a href="/general/books-tutorials-and-talks.html">Books, Tutorials, Talks</a></li> + <li><a href="/general/powered-by-mahout.html">Powered By Mahout</a> + <li><a href="/general/professional-support.html">Professional Support</a> + <li class="divider"></li> + <li class="nav-header">Resources</li> + <li><a href="/general/reference-reading.html">Reference Reading</a> + <li><a href="/general/faq.html">FAQ</a> + <li class="divider"></li> + <li class="nav-header">Legal</li> + <li><a href="http://www.apache.org/licenses/">License</a></li> + <li><a href="http://www.apache.org/security/">Security</a></li> + <li><a href="/general/privacy-policy.html">Privacy Policy</a> + </ul> + </li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Developers<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/developers/developer-resources.html">Developer resources</a></li> + <li><a href="/developers/version-control.html">Version control</a></li> + <li><a href="/developers/buildingmahout.html">Build from source</a></li> + <li><a href="/developers/issue-tracker.html">Issue tracker</a></li> + <li><a href="https://builds.apache.org/job/Mahout-Quality/" target="_blank">Code quality reports</a></li> + <li class="divider"></li> + <li class="nav-header">Contributions</li> + <li><a href="/developers/how-to-contribute.html">How to contribute</a></li> + <li><a href="/developers/how-to-become-a-committer.html">How to become a committer</a></li> + <li><a href="/developers/gsoc.html">GSoC</a></li> + <li class="divider"></li> + <li class="nav-header">For committers</li> + <li><a 
href="/developers/how-to-update-the-website.html">How to update the website</a></li> + <li><a href="/developers/patch-check-list.html">Patch check list</a></li> + <li><a href="/developers/github.html">Handling Github PRs</a></li> + <li><a href="/developers/how-to-release.html">How to release</a></li> + <li><a href="/developers/thirdparty-dependencies.html">Third party dependencies</a></li> + </ul> + </li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Basics<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/users/basics/algorithms.html">List of algorithms</a> + <li><a href="/users/basics/quickstart.html">Quickstart</a> + <li class="divider"></li> + <li class="nav-header">Working with text</li> + <li><a href="/users/basics/creating-vectors-from-text.html">Creating vectors from text</a> + <li><a href="/users/basics/collocations.html">Collocations</a> + <li class="divider"></li> + <li class="nav-header">Dimensionality reduction</li> + <li><a href="/users/dim-reduction/dimensional-reduction.html">Singular Value Decomposition</a></li> + <li><a href="/users/dim-reduction/ssvd.html">Stochastic SVD</a></li> + <li class="divider"></li> + <li class="nav-header">Topic Models</li> + <li><a href="/users/clustering/latent-dirichlet-allocation.html">Latent Dirichlet Allocation</a></li> + </ul> + </li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Spark<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/users/sparkbindings/home.html">Scala & Spark Bindings Overview</a></li> + <li><a href="/users/sparkbindings/play-with-shell.html">Playing with Mahout's Spark Shell</a></li> + <li class="divider"></li> + <li><a href="/users/sparkbindings/faq.html">FAQ</a></li> + </ul> + </li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Classification<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a 
href="/users/mapreduce/classification/bayesian.html">Naive Bayes</a></li> + <li><a href="/users/mapreduce/classification/hidden-markov-models.html">Hidden Markov Models</a></li> + <li><a href="/users/mapreduce/classification/logistic-regression.html">Logistic Regression</a></li> + <li><a href="/users/mapreduce/classification/partial-implementation.html">Random Forest</a></li> + + <li class="divider"></li> + <li class="nav-header">Examples</li> + <li><a href="/users/mapreduce/classification/breiman-example.html">Breiman example</a></li> + <li><a href="/users/mapreduce/classification/twenty-newsgroups.html">20 newsgroups example</a></li> + </ul></li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Clustering<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/users/mapreduce/clustering/k-means-clustering.html">k-Means</a></li> + <li><a href="/users/mapreduce/clustering/canopy-clustering.html">Canopy</a></li> + <li><a href="/users/mapreduce/clustering/fuzzy-k-means.html">Fuzzy k-Means</a></li> + <li><a href="/users/mapreduce/clustering/streaming-k-means.html">Streaming KMeans</a></li> + <li><a href="/users/mapreduce/clustering/spectral-clustering.html">Spectral Clustering</a></li> + <li class="divider"></li> + <li class="nav-header">Commandline usage</li> + <li><a href="/users/mapreduce/clustering/k-means-commandline.html">Options for k-Means</a></li> + <li><a href="/users/mapreduce/clustering/canopy-commandline.html">Options for Canopy</a></li> + <li><a href="/users/mapreduce/clustering/fuzzy-k-means-commandline.html">Options for Fuzzy k-Means</a></li> + <li class="divider"></li> + <li class="nav-header">Examples</li> + <li><a href="/users/mapreduce/clustering/clustering-of-synthetic-control-data.html">Synthetic data</a></li> + <li class="divider"></li> + <li class="nav-header">Post processing</li> + <li><a href="/users/mapreduce/clustering/cluster-dumper.html">Cluster Dumper tool</a></li> + <li><a 
href="/users/mapreduce/clustering/visualizing-sample-clusters.html">Cluster visualisation</a></li> + </ul></li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Recommendations<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/users/mapreduce/recommender/quickstart.html">Quickstart</a></li> + <li><a href="/users/mapreduce/recommender/recommender-first-timer-faq.html">First Timer FAQ</a></li> + <li><a href="/users/mapreduce/recommender/userbased-5-minutes.html">A user-based recommender <br/>in 5 minutes</a></li> + <li><a href="/users/mapreduce/recommender/matrix-factorization.html">Matrix factorization-based<br/> recommenders</a></li> + <li><a href="/users/mapreduce/recommender/recommender-documentation.html">Overview</a></li> + <li class="divider"></li> + <li class="nav-header">Hadoop</li> + <li><a href="/users/mapreduce/recommender/intro-itembased-hadoop.html">Intro to item-based recommendations<br/> with Hadoop</a></li> + <li><a href="/users/mapreduce/recommender/intro-als-hadoop.html">Intro to ALS recommendations<br/> with Hadoop</a></li> + <li class="nav-header">Spark</li> + <li><a href="/users/mapreduce/recommender/intro-cooccurrence-spark.html">Intro to cooccurrence-based<br/> recommendations with Spark</a></li> + </ul> + </li> + </ul> + </div><!--/.nav-collapse --> + </div> + </div> + </div> + +</div> + + <div id="sidebar"> + <div id="sidebar-wrap"> + <h2>Twitter</h2> + <ul class="sidemenu"> + <li> +<a class="twitter-timeline" href="https://twitter.com/ApacheMahout" data-widget-id="422861673444028416">Tweets by @ApacheMahout</a> +<script>!function(d,s,id){var js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+"://platform.twitter.com/widgets.js";fjs.parentNode.insertBefore(js,fjs);}}(document,"script","twitter-wjs");</script> +</li> + </ul> + <h2>Apache Software Foundation</h2> + <ul class="sidemenu"> + <li><a 
href="http://www.apache.org/foundation/how-it-works.html">How the ASF works</a></li> + <li><a href="http://www.apache.org/foundation/getinvolved.html">Get Involved</a></li> + <li><a href="http://www.apache.org/dev/">Developer Resources</a></li> + <li><a href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li> + <li><a href="http://www.apache.org/foundation/thanks.html">Thanks</a></li> + </ul> + <h2>Related Projects</h2> + <ul class="sidemenu"> + <li><a href="http://lucene.apache.org/">Lucene</a></li> + <li><a href="http://hadoop.apache.org/">Hadoop</a></li> + </ul> + </div> +</div> + + <div id="content-wrap" class="clearfix"> + <div id="main"> + <h1 id="classifying-data-from-the-command-line">Classifying data from the command line</h1> +<p>After you've done the <a href="/users/basics/quickstart.html">Quickstart</a> and are familiar with the basics of Mahout, it is time to build a +classifier from your own data. The following pieces <em>may</em> be useful in getting started:</p> +<p><a name="ClassifyingYourData-Input"></a></p> +<h1 id="input">Input</h1> +<p>For starters, you will need your data in an appropriate Vector format: See <a href="../basics/creating-vectors.html">Creating Vectors</a> as well as <a href="/users/basics/creating-vectors-from-text.html">Creating Vectors from Text</a>.</p> +<p><a name="ClassifyingYourData-RunningtheProcess"></a></p> +<h1 id="running-the-process">Running the Process</h1> +<ul> +<li>Logistic regression <a href="logistic-regression.html">background</a></li> +<li><a href="naivebayes.html">Naive Bayes background</a> and <a href="bayesian-commandline.html">commandline</a> options.</li> +<li><a href="complementary-naive-bayes.html">Complementary naive bayes background</a>, <a href="https://issues.apache.org/jira/browse/MAHOUT-60">design</a>, and <a href="c-bayes-commandline.html">c-bayes-commandline</a></li> +<li><a href="https://cwiki.apache.org/confluence/display/MAHOUT/Random+Forests">Random Forests 
Classification</a> comes with a <a href="breiman-example.html">Breiman example</a>. There is some really great documentation +over at <a href="http://www.markhneedham.com/blog/2012/10/27/kaggle-digit-recognizer-mahout-random-forest-attempt/">Mark Needham's blog</a>. Also check out the description on <a href="http://shawnwan.wordpress.com/2012/06/01/mahout-0-7-random-forest-examples/">Xiaomeng Shawn Wan's</a> blog.</li> +</ul> + </div> + </div> +</div> + <footer class="footer" align="center"> + <div class="container"> + <p> + Copyright © 2014 The Apache Software Foundation, Licensed under + the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. + <br /> + Apache and the Apache feather logos are trademarks of The Apache Software Foundation. + </p> + </div> + </footer> + + <script src="/js/jquery-1.9.1.min.js"></script> + <script src="/js/bootstrap.min.js"></script> + <script> + (function() { + var cx = '012254517474945470291:vhsfv7eokdc'; + var gcse = document.createElement('script'); + gcse.type = 'text/javascript'; + gcse.async = true; + gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') + + '//www.google.com/cse/cse.js?cx=' + cx; + var s = document.getElementsByTagName('script')[0]; + s.parentNode.insertBefore(gcse, s); + })(); + </script> +</body> +</html>
