Repository: mahout
Updated Branches:
  refs/heads/website ac56b5512 -> 0e718ec99


http://git-wip-us.apache.org/repos/asf/mahout/blob/0e718ec9/website/oldsite/_site/users/algorithms/spark-naive-bayes.html
----------------------------------------------------------------------
diff --git a/website/oldsite/_site/users/algorithms/spark-naive-bayes.html 
b/website/oldsite/_site/users/algorithms/spark-naive-bayes.html
new file mode 100644
index 0000000..f699357
--- /dev/null
+++ b/website/oldsite/_site/users/algorithms/spark-naive-bayes.html
@@ -0,0 +1,464 @@
+
+
+<!DOCTYPE html>
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head><meta 
http-equiv="Content-Type" content="text/html; charset=UTF-8">
+  <title>Apache Mahout: Scalable machine learning and data mining</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+  <meta name="Distribution" content="Global">
+  <meta name="Robots" content="index,follow">
+  <meta name="keywords" content="apache, apache hadoop, apache lucene,
+        business data mining, cluster analysis,
+        collaborative filtering, data extraction, data filtering, data 
framework, data integration,
+        data matching, data mining, data mining algorithms, data mining 
analysis, data mining data,
+        data mining introduction, data mining software,
+        data mining techniques, data representation, data set, datamining,
+        feature extraction, fuzzy k means, genetic algorithm, hadoop,
+        hierarchical clustering, high dimensional, introduction to data 
mining, kmeans,
+        knowledge discovery, learning approach, learning approaches, learning 
methods,
+        learning techniques, lucene, machine learning, machine translation, 
mahout apache,
+        mahout taste, map reduce hadoop, mining data, mining methods, naive 
bayes,
+        natural language processing,
+        supervised, text mining, time series data, unsupervised, web data 
mining">
+  <link rel="shortcut icon" type="image/x-icon" 
href="https://mahout.apache.org/images/favicon.ico">
+  <!--<script type="text/javascript" src="/js/prototype.js"></script>-->
+  <script type="text/javascript" 
src="https://ajax.googleapis.com/ajax/libs/prototype/1.7.2.0/prototype.js"></script>
+  <script type="text/javascript" 
src="/assets/themes/mahout-retro/js/effects.js"></script>
+  <script type="text/javascript" 
src="/assets/themes/mahout-retro/js/search.js"></script>
+  <script type="text/javascript" 
src="/assets/themes/mahout-retro/js/slides.js"></script>
+
+  <link href="/assets/themes/mahout-retro/css/bootstrap.min.css" 
rel="stylesheet" media="screen">
+  <link href="/assets/themes/mahout-retro/css/bootstrap-responsive.css" 
rel="stylesheet">
+  <link rel="stylesheet" href="/assets/themes/mahout-retro/css/global.css" 
type="text/css">
+
+  <!-- mathJax stuff -- use `\(...\)` for inline style math in markdown -->
+  <script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    tex2jax: {
+      skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
+    }
+  });
+  MathJax.Hub.Queue(function() {
+    var all = MathJax.Hub.getAllJax(), i;
+    for(i = 0; i < all.length; i += 1) {
+      all[i].SourceElement().parentNode.className += ' has-jax';
+    }
+  });
+  </script>
+  <script type="text/javascript">
+    var mathjax = document.createElement('script'); 
+    mathjax.type = 'text/javascript'; 
+    mathjax.async = true;
+
+    mathjax.src = ('https:' == document.location.protocol) ?
+        
'https://c328740.ssl.cf1.rackcdn.com/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'
 : 
+        
'http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
+       
+         var s = document.getElementsByTagName('script')[0]; 
+    s.parentNode.insertBefore(mathjax, s);
+  </script>
+</head>
+
+<body id="home" data-twttr-rendered="true">
+  <div id="wrap">
+   <div id="header">
+    <div id="logo"><a href="/"><img src="/assets/img/mahout-logo-brudman.png" 
alt="Logos for Mahout and Apache Software Foundation" /></a></div>
+  <div id="search">
+    <form id="search-form" action="http://www.google.com/search" method="get" 
class="navbar-search pull-right">    
+      <input value="http://mahout.apache.org" name="sitesearch" type="hidden">
+      <input class="search-query" name="q" id="query" type="text">
+      <input id="submission" type="image" src="/assets/img/mahout-lupe.png" 
alt="Search" />
+    </form>
+  </div>
+ 
+    <div class="navbar navbar-inverse" 
style="position:absolute;top:133px;padding-right:0px;padding-left:0px;">
+      <div class="navbar-inner" style="border: none; background: #999; border: 
none; border-radius: 0px;">
+        <div class="container">
+          <button type="button" class="btn btn-navbar" data-toggle="collapse" 
data-target=".nav-collapse">
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+          </button>
+          <!-- <a class="brand" href="#">Apache Community Development 
Project</a> -->
+            <!--<div class="nav-collapse collapse">-->
+<div class="collapse navbar-collapse" id="main-navbar">
+    <ul class="nav navbar-nav">
+        <!-- <li><a href="/">Home</a></li> -->
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">General<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/general/downloads.html">Downloads</a>
+                <li><a href="/general/who-we-are.html">Who we are</a>
+                <li><a 
href="/general/mailing-lists,-irc-and-archives.html">Mailing Lists</a>
+                <li><a href="/general/release-notes.html">Release Notes</a>
+                <li><a href="/general/books-tutorials-and-talks.html">Books, 
Tutorials, Talks</a></li>
+                <li><a href="/general/powered-by-mahout.html">Powered By 
Mahout</a>
+                <li><a href="/general/professional-support.html">Professional 
Support</a>
+                <li class="divider"></li>
+                <li class="nav-header">Resources</li>
+                <li><a href="/general/reference-reading.html">Reference 
Reading</a>
+                <li><a href="/general/faq.html">FAQ</a>
+                <li class="divider"></li>
+                <li class="nav-header">Legal</li>
+                <li><a href="http://www.apache.org/licenses/">License</a></li>
+                <li><a href="http://www.apache.org/security/">Security</a></li>
+                <li><a href="/general/privacy-policy.html">Privacy Policy</a>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Developers<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/developers/developer-resources.html">Developer 
resources</a></li>
+                <li><a href="/developers/version-control.html">Version 
control</a></li>
+                <li><a href="/developers/buildingmahout.html">Build from 
source</a></li>
+                <li><a href="/developers/issue-tracker.html">Issue 
tracker</a></li>
+                <li><a href="https://builds.apache.org/job/Mahout-Quality/" 
target="_blank">Code quality reports</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Contributions</li>
+                <li><a href="/developers/how-to-contribute.html">How to 
contribute</a></li>
+                <li><a href="/developers/how-to-become-a-committer.html">How 
to become a committer</a></li>
+                <li><a href="/developers/gsoc.html">GSoC</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">For committers</li>
+                <li><a href="/developers/how-to-update-the-website.html">How 
to update the website</a></li>
+                <li><a href="/developers/patch-check-list.html">Patch check 
list</a></li>
+                <li><a href="/developers/github.html">Handling Github 
PRs</a></li>
+                <li><a href="/developers/how-to-release.html">How to 
release</a></li>
+                <li><a href="/developers/thirdparty-dependencies.html">Third 
party dependencies</a></li>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Mahout-Samsara<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/users/sparkbindings/home.html">Scala &amp; Spark 
Bindings Overview</a></li>
+                <li><a href="/users/sparkbindings/faq.html">FAQ</a></li>
+                <li><a 
href="/users/flinkbindings/playing-with-samsara-flink.html">Flink Bindings 
Overview</a></li>
+                <li class="nav-header">Engines</li>
+                <li><a href="/users/sparkbindings/home.html">Spark</a></li>
+                <li><a 
href="/users/environment/h2o-internals.html">H2O</a></li>
+                <li><a 
href="/users/flinkbindings/flink-internals.html">Flink</a></li>
+                <li class="nav-header">References</li>
+                <li><a 
href="/users/environment/in-core-reference.html">In-Core Algebraic DSL 
Reference</a></li>
+                <li><a 
href="/users/environment/out-of-core-reference.html">Distributed Algebraic DSL 
Reference</a></li>
+                <li class="nav-header">Tutorials</li>
+                <li><a 
href="/users/sparkbindings/play-with-shell.html">Playing with Mahout's Spark 
Shell</a></li>
+                <li><a href="/users/environment/how-to-build-an-app.html">How 
to build an app</a></li>
+                <li><a 
href="/users/environment/classify-a-doc-from-the-shell.html">Building a text 
classifier in Mahout's Spark Shell</a></li>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Algorithms<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/users/basics/algorithms.html">List of 
algorithms</a>
+                <li class="nav-header">Distributed Matrix Decomposition</li>
+                <li><a href="/users/algorithms/d-qr.html">Cholesky QR</a></li>
+                <li><a href="/users/algorithms/d-ssvd.html">SSVD</a></li>
+                <li><a href="/users/algorithms/d-als.html">Distributed 
ALS</a></li>
+                <li><a href="/users/algorithms/d-spca.html">SPCA</a></li>
+                <li class="nav-header">Recommendations</li>
+                <li><a 
href="/users/algorithms/recommender-overview.html">Recommender Overview</a></li>
+                <li><a 
href="/users/algorithms/intro-cooccurrence-spark.html">Intro to 
cooccurrence-based<br/> recommendations with Spark</a></li>
+                <li class="nav-header">Classification</li>
+                <li><a href="/users/algorithms/spark-naive-bayes.html">Spark 
Naive Bayes</a></li>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">MapReduce Basics<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/users/basics/algorithms.html">List of 
algorithms</a>
+                <li><a href="/users/basics/quickstart.html">Overview</a>
+                <li class="divider"></li>
+                <li class="nav-header">Working with text</li>
+                <li><a 
href="/users/basics/creating-vectors-from-text.html">Creating vectors from 
text</a>
+                <li><a href="/users/basics/collocations.html">Collocations</a>
+                <li class="divider"></li>
+                <li class="nav-header">Dimensionality reduction</li>
+                <li><a 
href="/users/dim-reduction/dimensional-reduction.html">Singular Value 
Decomposition</a></li>
+                <li><a href="/users/dim-reduction/ssvd.html">Stochastic 
SVD</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Topic Models</li>
+                <li><a 
href="/users/clustering/latent-dirichlet-allocation.html">Latent Dirichlet 
Allocation</a></li>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Mahout MapReduce<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li class="nav-header">Classification</li>
+                <li><a href="/users/classification/bayesian.html">Naive 
Bayes</a></li>
+                <li><a 
href="/users/classification/hidden-markov-models.html">Hidden Markov 
Models</a></li>
+                <li><a 
href="/users/classification/logistic-regression.html">Logistic Regression 
(Single Machine)</a></li>
+                <li><a 
href="/users/classification/partial-implementation.html">Random Forest</a></li>
+                <li class="nav-header">Classification Examples</li>
+                <li><a 
href="/users/classification/breiman-example.html">Breiman example</a></li>
+                <li><a href="/users/classification/twenty-newsgroups.html">20 
newsgroups example</a></li>
+                <li><a 
href="/users/classification/bankmarketing-example.html">SGD classifier bank 
marketing</a></li>
+                <li><a 
href="/users/classification/wikipedia-classifier-example.html">Wikipedia XML 
parser and classifier</a></li>
+                <li class="nav-header">Clustering</li>
+                <li><a 
href="/users/clustering/k-means-clustering.html">k-Means</a></li>
+                <li><a 
href="/users/clustering/canopy-clustering.html">Canopy</a></li>
+                <li><a href="/users/clustering/fuzzy-k-means.html">Fuzzy 
k-Means</a></li>
+                <li><a 
href="/users/clustering/streaming-k-means.html">Streaming KMeans</a></li>
+                <li><a 
href="/users/clustering/spectral-clustering.html">Spectral Clustering</a></li>
+                <li class="nav-header">Clustering Commandline usage</li>
+                <li><a 
href="/users/clustering/k-means-commandline.html">Options for k-Means</a></li>
+                <li><a 
href="/users/clustering/canopy-commandline.html">Options for Canopy</a></li>
+                <li><a 
href="/users/clustering/fuzzy-k-means-commandline.html">Options for Fuzzy 
k-Means</a></li>
+                <li class="nav-header">Clustering Examples</li>
+                <li><a 
href="/users/clustering/clustering-of-synthetic-control-data.html">Synthetic 
data</a></li>
+                <li class="nav-header">Cluster Post processing</li>
+                <li><a href="/users/clustering/cluster-dumper.html">Cluster 
Dumper tool</a></li>
+                <li><a 
href="/users/clustering/visualizing-sample-clusters.html">Cluster 
visualisation</a></li>
+                <li class="nav-header">Recommendations</li>
+                <li><a 
href="/users/recommender/recommender-first-timer-faq.html">First Timer 
FAQ</a></li>
+                <li><a href="/users/recommender/userbased-5-minutes.html">A 
user-based recommender <br/>in 5 minutes</a></li>
+                <li><a 
href="/users/recommender/matrix-factorization.html">Matrix 
factorization-based<br/> recommenders</a></li>
+                <li><a 
href="/users/recommender/recommender-documentation.html">Overview</a></li>
+                <li><a 
href="/users/recommender/intro-itembased-hadoop.html">Intro to item-based 
recommendations<br/> with Hadoop</a></li>
+                <li><a href="/users/recommender/intro-als-hadoop.html">Intro 
to ALS recommendations<br/> with Hadoop</a></li>
+            </ul>
+        </li>
+        <!--  <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Recommendations<b class="caret"></b></a>
+          <ul class="dropdown-menu">
+
+          </ul>
+        </li> -->
+    </ul>
+</div><!--/.nav-collapse -->
+        </div>
+      </div>
+    </div>
+
+</div>
+
+ <div id="sidebar">
+  <div id="sidebar-wrap">
+    <h2>Twitter</h2>
+       <ul class="sidemenu">
+               <li>
+<a class="twitter-timeline" href="https://twitter.com/ApacheMahout" 
data-widget-id="422861673444028416">Tweets by @ApacheMahout</a>
+<script>!function(d,s,id){var 
js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+"://platform.twitter.com/widgets.js";fjs.parentNode.insertBefore(js,fjs);}}(document,"script","twitter-wjs");</script>
+</li>
+       </ul>
+    <h2>Apache Software Foundation</h2>
+    <ul class="sidemenu">
+      <li><a href="http://www.apache.org/foundation/how-it-works.html">How the 
ASF works</a></li>
+      <li><a href="http://www.apache.org/foundation/getinvolved.html">Get 
Involved</a></li>
+      <li><a href="http://www.apache.org/dev/">Developer Resources</a></li>
+      <li><a 
href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li>
+      <li><a 
href="http://www.apache.org/foundation/thanks.html">Thanks</a></li>
+    </ul>
+    <h2>Related Projects</h2>
+    <ul class="sidemenu">
+      <li><a href="http://lucene.apache.org/">Apache Lucene</a></li>
+      <li><a href="http://hadoop.apache.org/">Apache Hadoop</a></li>
+      <li><a href="http://bigtop.apache.org/">Apache Bigtop</a></li>
+      <li><a href="http://spark.apache.org/">Apache Spark</a></li>
+         <li><a href="http://flink.apache.org/">Apache Flink</a></li>
+    </ul>
+  </div>
+</div>
+
+  <div id="content-wrap" class="clearfix">
+   <div id="main">
+
+    <h1 id="spark-naive-bayes">Spark Naive Bayes</h1>
+
+<h2 id="intro">Intro</h2>
+
+<p>Mahout currently has two flavors of Naive Bayes.  The first is standard 
Multinomial Naive Bayes. The second is an implementation of Transformed 
Weight-normalized Complement Naive Bayes as introduced by Rennie et al. <a 
href="http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf">[1]</a>. We 
refer to the former as Bayes and the latter as CBayes.</p>
+
+<p>Where Bayes has long been a standard in text classification, CBayes is an 
extension of Bayes that performs particularly well on datasets with skewed 
classes and has been shown to be competitive with algorithms of higher 
complexity such as Support Vector Machines.</p>
+
+<h2 id="implementations">Implementations</h2>
+<p>The Mahout <code class="highlighter-rouge">math-scala</code> library has an 
implementation of both Bayes and CBayes which is further optimized in the <code 
class="highlighter-rouge">spark</code> module. Currently the Spark optimized 
version provides CLI drivers for training and testing. Mahout Spark-Naive-Bayes 
models can also be trained, tested and saved to the filesystem from the Mahout 
Spark Shell.</p>
+
+<h2 id="preprocessing-and-algorithm">Preprocessing and Algorithm</h2>
+
+<p>As described in <a 
href="http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf">[1]</a> Mahout 
Naive Bayes is broken down into the following steps (assignments are over all 
possible index values):</p>
+
+<ul>
+  <li>Let <code 
class="highlighter-rouge">\(\vec{d}=(\vec{d_1},...,\vec{d_n})\)</code> be a set 
of documents; <code class="highlighter-rouge">\(d_{ij}\)</code> is the count of 
word <code class="highlighter-rouge">\(i\)</code> in document <code 
class="highlighter-rouge">\(j\)</code>.</li>
+  <li>Let <code class="highlighter-rouge">\(\vec{y}=(y_1,...,y_n)\)</code> be 
their labels.</li>
+  <li>Let <code class="highlighter-rouge">\(\alpha_i\)</code> be a smoothing 
parameter for all words in the vocabulary; let <code 
class="highlighter-rouge">\(\alpha=\sum_i{\alpha_i}\)</code>.</li>
+  <li><strong>Preprocessing</strong> (via seq2sparse) TF-IDF transformation and 
L2 length normalization of <code class="highlighter-rouge">\(\vec{d}\)</code>
+    <ol>
+      <li><code class="highlighter-rouge">\(d_{ij} = 
\sqrt{d_{ij}}\)</code></li>
+      <li><code class="highlighter-rouge">\(d_{ij} = 
d_{ij}\left(\log{\frac{\sum_k1}{\sum_k\delta_{ik}+1}}+1\right)\)</code></li>
+      <li><code class="highlighter-rouge">\(d_{ij} 
=\frac{d_{ij}}{\sqrt{\sum_k{d_{kj}^2}}}\)</code></li>
+    </ol>
+  </li>
+  <li><strong>Training: Bayes</strong><code 
class="highlighter-rouge">\((\vec{d},\vec{y})\)</code> calculate term weights 
<code class="highlighter-rouge">\(w_{ci}\)</code> as:
+    <ol>
+      <li><code 
class="highlighter-rouge">\(\hat\theta_{ci}=\frac{d_{ic}+\alpha_i}{\sum_k{d_{kc}}+\alpha}\)</code></li>
+      <li><code 
class="highlighter-rouge">\(w_{ci}=\log{\hat\theta_{ci}}\)</code></li>
+    </ol>
+  </li>
+  <li><strong>Training: CBayes</strong><code 
class="highlighter-rouge">\((\vec{d},\vec{y})\)</code> calculate term weights 
<code class="highlighter-rouge">\(w_{ci}\)</code> as:
+    <ol>
+      <li><code class="highlighter-rouge">\(\hat\theta_{ci} = 
\frac{\sum_{j:y_j\neq c}d_{ij}+\alpha_i}{\sum_{j:y_j\neq 
c}{\sum_k{d_{kj}}}+\alpha}\)</code></li>
+      <li><code 
class="highlighter-rouge">\(w_{ci}=-\log{\hat\theta_{ci}}\)</code></li>
+      <li><code class="highlighter-rouge">\(w_{ci}=\frac{w_{ci}}{\sum_i \lvert 
w_{ci}\rvert}\)</code></li>
+    </ol>
+  </li>
+  <li><strong>Label Assignment/Testing:</strong>
+    <ol>
+      <li>Let <code class="highlighter-rouge">\(\vec{t}= 
(t_1,...,t_n)\)</code> be a test document; let <code 
class="highlighter-rouge">\(t_i\)</code> be the count of the word <code 
class="highlighter-rouge">\(i\)</code>.</li>
+      <li>Label the document according to <code 
class="highlighter-rouge">\(l(t)=\arg\max_c \sum\limits_{i} t_i 
w_{ci}\)</code></li>
+    </ol>
+  </li>
+</ul>
+
+<p>As we can see, the main difference between Bayes and CBayes is the weight 
calculation step.  Where Bayes weighs terms more heavily based on the 
likelihood that they belong to class <code 
class="highlighter-rouge">\(c\)</code>, CBayes seeks to maximize term weights 
on the likelihood that they do not belong to any other class.</p>
+
+<h2 id="running-from-the-command-line">Running from the command line</h2>
+
+<p>Mahout provides CLI drivers for all of the above steps.  Here we will give a 
simple overview of Mahout CLI commands used to preprocess the data, train the 
model and assign labels to the training set. An <a 
href="https://github.com/apache/mahout/blob/master/examples/bin/classify-20newsgroups.sh">example
 script</a> is given for the full process from data acquisition through 
classification of the classic <a 
href="https://mahout.apache.org/users/classification/twenty-newsgroups.html">20 
Newsgroups corpus</a>.</p>
+
+<ul>
+  <li>
+    <p><strong>Preprocessing:</strong>
+For a set of Sequence File Formatted documents in PATH_TO_SEQUENCE_FILES the 
<a 
href="https://mahout.apache.org/users/basics/creating-vectors-from-text.html">mahout
 seq2sparse</a> command performs the TF-IDF transformations (-wt tfidf option) 
and L2 length normalization (-n 2 option) as follows:</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code>  $ mahout 
seq2sparse 
+    -i ${PATH_TO_SEQUENCE_FILES} 
+    -o ${PATH_TO_TFIDF_VECTORS} 
+    -nv 
+    -n 2
+    -wt tfidf
+</code></pre>
+    </div>
+  </li>
+  <li>
+    <p><strong>Training:</strong>
+The model is then trained using <code class="highlighter-rouge">mahout 
spark-trainnb</code>.  The default is to train a Bayes model. The -c option is 
given to train a CBayes model:</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code>  $ mahout 
spark-trainnb
+    -i ${PATH_TO_TFIDF_VECTORS} 
+    -o ${PATH_TO_MODEL}
+    -ow 
+    -c
+</code></pre>
+    </div>
+  </li>
+  <li>
+    <p><strong>Label Assignment/Testing:</strong>
+Classification and testing on a holdout set can then be performed via <code 
class="highlighter-rouge">mahout spark-testnb</code>. Again, the -c option 
indicates that the model is CBayes:</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code>  $ mahout 
spark-testnb 
+    -i ${PATH_TO_TFIDF_TEST_VECTORS}
+    -m ${PATH_TO_MODEL} 
+    -c 
+</code></pre>
+    </div>
+  </li>
+</ul>
+
+<h2 id="command-line-options">Command line options</h2>
+
+<ul>
+  <li>
+    <p><strong>Preprocessing:</strong> <em>note: still reliant on MapReduce 
seq2sparse</em></p>
+
+    <p>Only relevant parameters used for Bayes/CBayes as detailed above are 
shown. Several other transformations can be performed by <code 
class="highlighter-rouge">mahout seq2sparse</code> and used as input to 
Bayes/CBayes.  For a full list of <code class="highlighter-rouge">mahout 
seq2sparse</code> options see the <a 
href="https://mahout.apache.org/users/basics/creating-vectors-from-text.html">Creating
 vectors from text</a> page.</p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code>  $ mahout 
seq2sparse                         
+    --output (-o) output             The directory pathname for output.        
+    --input (-i) input               Path to job input directory.              
+    --weight (-wt) weight            The kind of weight to use. Currently TF   
+                                         or TFIDF. Default: TFIDF              
    
+    --norm (-n) norm                 The norm to use, expressed as either a    
+                                         float or "INF" if you want to use the 
    
+                                         Infinite norm.  Must be greater or 
equal  
+                                         to 0.  The default is not to 
normalize    
+    --overwrite (-ow)                If set, overwrite the output directory    
+    --sequentialAccessVector (-seq)  (Optional) Whether output vectors should  
+                                         be SequentialAccessVectors. If set 
true   
+                                         else false                            
    
+    --namedVector (-nv)              (Optional) Whether output vectors should  
+                                         be NamedVectors. If set true else 
false   
+</code></pre>
+    </div>
+  </li>
+  <li>
+    <p><strong>Training:</strong></p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code>  $ mahout 
spark-trainnb
+    --input (-i) input               Path to job input directory.              
   
+    --output (-o) output             The directory pathname for output.        
   
+    --trainComplementary (-c)        Train complementary? Default is false.
+    --master (-ma)                   Spark Master URL (optional). Default: 
"local".
+                                         Note that you can specify the number 
of 
+                                         cores to get a performance 
improvement, 
+                                         for example "local[4]"
+    --help (-h)                      Print out help                            
   
+</code></pre>
+    </div>
+  </li>
+  <li>
+    <p><strong>Testing:</strong></p>
+
+    <div class="highlighter-rouge"><pre class="highlight"><code>  $ mahout 
spark-testnb   
+    --input (-i) input               Path to job input directory.              
    
+    --model (-m) model               The path to the model built during 
training.   
+    --testComplementary (-c)         Test complementary? Default is false.     
                     
+    --master (-ma)                   Spark Master URL (optional). Default: 
"local". 
+                                         Note that you can specify the number 
of 
+                                         cores to get a performance 
improvement, 
+                                         for example "local[4]"                
        
+    --help (-h)                      Print out help                            
    
+</code></pre>
+    </div>
+  </li>
+</ul>
+
+<h2 id="examples">Examples</h2>
+<ol>
+  <li><a 
href="https://github.com/apache/mahout/blob/master/examples/bin/classify-20newsgroups.sh">20
 Newsgroups classification</a></li>
+  <li><a 
href="https://github.com/apache/mahout/blob/master/examples/bin/spark-document-classifier.mscala">Document
 classification with Naive Bayes in the Mahout shell</a></li>
+</ol>
+
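+<h2 id="references">References</h2>
+
+<p>[1] Jason D. M. Rennie, Lawrence Shih, Jaime Teevan and David R. Karger. <a href="http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf">Tackling the Poor Assumptions of Naive Bayes Text Classifiers</a>. Proceedings of the Twentieth International Conference on Machine Learning (ICML), 2003.</p>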
+
+
+   </div>
+  </div>     
+</div> 
+  <footer class="footer" align="center">
+    <div class="container">
+      <p>
+        Copyright &copy; 2014-2016 The Apache Software Foundation, Licensed 
under
+        the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache 
License, Version 2.0</a>.
+        <br />
+                 Apache Mahout, Mahout, Apache, the Apache feather logo, and 
the elephant rider logo are either registered trademarks or trademarks of <a 
href="http://www.apache.org/foundation/marks/">The Apache Software 
Foundation</a> in the United States and other countries.
+      </p>
+    </div>
+  </footer>
+  
+  <script src="/assets/themes/mahout-retro/js/jquery-1.9.1.min.js"></script>
+  <script src="/assets/themes/mahout-retro/js/bootstrap.min.js"></script>
+  <script>
+    (function() {
+      var cx = '012254517474945470291:vhsfv7eokdc';
+      var gcse = document.createElement('script');
+      gcse.type = 'text/javascript';
+      gcse.async = true;
+      gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') 
+
+          '//www.google.com/cse/cse.js?cx=' + cx;
+      var s = document.getElementsByTagName('script')[0];
+      s.parentNode.insertBefore(gcse, s);
+    })();
+  </script>
+</body>
+</html>
+

Reply via email to