Repository: mahout Updated Branches: refs/heads/website ac56b5512 -> 0e718ec99
http://git-wip-us.apache.org/repos/asf/mahout/blob/0e718ec9/website/oldsite/_site/users/algorithms/spark-naive-bayes.html ---------------------------------------------------------------------- diff --git a/website/oldsite/_site/users/algorithms/spark-naive-bayes.html b/website/oldsite/_site/users/algorithms/spark-naive-bayes.html new file mode 100644 index 0000000..f699357 --- /dev/null +++ b/website/oldsite/_site/users/algorithms/spark-naive-bayes.html @@ -0,0 +1,464 @@ + + +<!DOCTYPE html> +<!-- + + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> + <title>Apache Mahout: Scalable machine learning and data mining</title> + <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> + <meta name="Distribution" content="Global"> + <meta name="Robots" content="index,follow"> + <meta name="keywords" content="apache, apache hadoop, apache lucene, + business data mining, cluster analysis, + collaborative filtering, data extraction, data filtering, data framework, data integration, + data matching, data mining, data mining algorithms, data mining analysis, data mining data, + data mining introduction, data mining software, + data mining techniques, data representation, data set, datamining, + feature extraction, fuzzy k means, genetic algorithm, hadoop, + hierarchical clustering, high dimensional, introduction to data mining, kmeans, + knowledge discovery, learning approach, learning approaches, learning methods, + learning techniques, lucene, machine learning, machine translation, mahout apache, + mahout taste, map reduce hadoop, mining data, mining methods, naive bayes, + natural language processing, + supervised, text mining, time series data, unsupervised, web data mining"> + <link rel="shortcut icon" type="image/x-icon" href="https://mahout.apache.org/images/favicon.ico"> + <!--<script type="text/javascript" src="/js/prototype.js"></script>--> + <script type="text/javascript" src="https://ajax.googleapis.com/ajax/libs/prototype/1.7.2.0/prototype.js"></script> + <script type="text/javascript" src="/assets/themes/mahout-retro/js/effects.js"></script> + <script type="text/javascript" src="/assets/themes/mahout-retro/js/search.js"></script> + <script type="text/javascript" src="/assets/themes/mahout-retro/js/slides.js"></script> + + <link href="/assets/themes/mahout-retro/css/bootstrap.min.css" rel="stylesheet" media="screen"> + <link 
href="/assets/themes/mahout-retro/css/bootstrap-responsive.css" rel="stylesheet"> + <link rel="stylesheet" href="/assets/themes/mahout-retro/css/global.css" type="text/css"> + + <!-- mathJax stuff -- use `\(...\)` for inline style math in markdown --> + <script type="text/x-mathjax-config"> + MathJax.Hub.Config({ + tex2jax: { + skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] + } + }); + MathJax.Hub.Queue(function() { + var all = MathJax.Hub.getAllJax(), i; + for(i = 0; i < all.length; i += 1) { + all[i].SourceElement().parentNode.className += ' has-jax'; + } + }); + </script> + <script type="text/javascript"> + var mathjax = document.createElement('script'); + mathjax.type = 'text/javascript'; + mathjax.async = true; + + mathjax.src = ('https:' == document.location.protocol) ? + 'https://c328740.ssl.cf1.rackcdn.com/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' : + 'http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; + + var s = document.getElementsByTagName('script')[0]; + s.parentNode.insertBefore(mathjax, s); + </script> +</head> + +<body id="home" data-twttr-rendered="true"> + <div id="wrap"> + <div id="header"> + <div id="logo"><a href="/"><img src="/assets/img/mahout-logo-brudman.png" alt="Logos for Mahout and Apache Software Foundation" /></a></div> + <div id="search"> + <form id="search-form" action="http://www.google.com/search" method="get" class="navbar-search pull-right"> + <input value="http://mahout.apache.org" name="sitesearch" type="hidden"> + <input class="search-query" name="q" id="query" type="text"> + <input id="submission" type="image" src="/assets/img/mahout-lupe.png" alt="Search" /> + </form> + </div> + + <div class="navbar navbar-inverse" style="position:absolute;top:133px;padding-right:0px;padding-left:0px;"> + <div class="navbar-inner" style="border: none; background: #999; border: none; border-radius: 0px;"> + <div class="container"> + <button type="button" class="btn btn-navbar" 
data-toggle="collapse" data-target=".nav-collapse"> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> + <!-- <a class="brand" href="#">Apache Community Development Project</a> --> + <!--<div class="nav-collapse collapse">--> +<div class="collapse navbar-collapse" id="main-navbar"> + <ul class="nav navbar-nav"> + <!-- <li><a href="/">Home</a></li> --> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">General<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/general/downloads.html">Downloads</a> + <li><a href="/general/who-we-are.html">Who we are</a> + <li><a href="/general/mailing-lists,-irc-and-archives.html">Mailing Lists</a> + <li><a href="/general/release-notes.html">Release Notes</a> + <li><a href="/general/books-tutorials-and-talks.html">Books, Tutorials, Talks</a></li> + <li><a href="/general/powered-by-mahout.html">Powered By Mahout</a> + <li><a href="/general/professional-support.html">Professional Support</a> + <li class="divider"></li> + <li class="nav-header">Resources</li> + <li><a href="/general/reference-reading.html">Reference Reading</a> + <li><a href="/general/faq.html">FAQ</a> + <li class="divider"></li> + <li class="nav-header">Legal</li> + <li><a href="http://www.apache.org/licenses/">License</a></li> + <li><a href="http://www.apache.org/security/">Security</a></li> + <li><a href="/general/privacy-policy.html">Privacy Policy</a> + </ul> + </li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Developers<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/developers/developer-resources.html">Developer resources</a></li> + <li><a href="/developers/version-control.html">Version control</a></li> + <li><a href="/developers/buildingmahout.html">Build from source</a></li> + <li><a href="/developers/issue-tracker.html">Issue tracker</a></li> + <li><a 
href="https://builds.apache.org/job/Mahout-Quality/" target="_blank">Code quality reports</a></li> + <li class="divider"></li> + <li class="nav-header">Contributions</li> + <li><a href="/developers/how-to-contribute.html">How to contribute</a></li> + <li><a href="/developers/how-to-become-a-committer.html">How to become a committer</a></li> + <li><a href="/developers/gsoc.html">GSoC</a></li> + <li class="divider"></li> + <li class="nav-header">For committers</li> + <li><a href="/developers/how-to-update-the-website.html">How to update the website</a></li> + <li><a href="/developers/patch-check-list.html">Patch check list</a></li> + <li><a href="/developers/github.html">Handling Github PRs</a></li> + <li><a href="/developers/how-to-release.html">How to release</a></li> + <li><a href="/developers/thirdparty-dependencies.html">Third party dependencies</a></li> + </ul> + </li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Mahout-Samsara<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/users/sparkbindings/home.html">Scala & Spark Bindings Overview</a></li> + <li><a href="/users/sparkbindings/faq.html">FAQ</a></li> + <li><a href="/users/flinkbindings/playing-with-samsara-flink.html">Flink Bindings Overview</a></li> + <li class="nav-header">Engines</li> + <li><a href="/users/sparkbindings/home.html">Spark</a></li> + <li><a href="/users/environment/h2o-internals.html">H2O</a></li> + <li><a href="/users/flinkbindings/flink-internals.html">Flink</a></li> + <li class="nav-header">References</li> + <li><a href="/users/environment/in-core-reference.html">In-Core Algebraic DSL Reference</a></li> + <li><a href="/users/environment/out-of-core-reference.html">Distributed Algebraic DSL Reference</a></li> + <li class="nav-header">Tutorials</li> + <li><a href="/users/sparkbindings/play-with-shell.html">Playing with Mahout's Spark Shell</a></li> + <li><a href="/users/environment/how-to-build-an-app.html">How to build an 
app</a></li> + <li><a href="/users/environment/classify-a-doc-from-the-shell.html">Building a text classifier in Mahout's Spark Shell</a></li> + </ul> + </li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Algorithms<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/users/basics/algorithms.html">List of algorithms</a> + <li class="nav-header">Distributed Matrix Decomposition</li> + <li><a href="/users/algorithms/d-qr.html">Cholesky QR</a></li> + <li><a href="/users/algorithms/d-ssvd.html">SSVD</a></li> + <li><a href="/users/algorithms/d-als.html">Distributed ALS</a></li> + <li><a href="/users/algorithms/d-spca.html">SPCA</a></li> + <li class="nav-header">Recommendations</li> + <li><a href="/users/algorithms/recommender-overview.html">Recommender Overview</a></li> + <li><a href="/users/algorithms/intro-cooccurrence-spark.html">Intro to cooccurrence-based<br/> recommendations with Spark</a></li> + <li class="nav-header">Classification</li> + <li><a href="/users/algorithms/spark-naive-bayes.html">Spark Naive Bayes</a></li> + </ul> + </li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">MapReduce Basics<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li><a href="/users/basics/algorithms.html">List of algorithms</a> + <li><a href="/users/basics/quickstart.html">Overview</a> + <li class="divider"></li> + <li class="nav-header">Working with text</li> + <li><a href="/users/basics/creating-vectors-from-text.html">Creating vectors from text</a> + <li><a href="/users/basics/collocations.html">Collocations</a> + <li class="divider"></li> + <li class="nav-header">Dimensionality reduction</li> + <li><a href="/users/dim-reduction/dimensional-reduction.html">Singular Value Decomposition</a></li> + <li><a href="/users/dim-reduction/ssvd.html">Stochastic SVD</a></li> + <li class="divider"></li> + <li class="nav-header">Topic Models</li> + <li><a 
href="/users/clustering/latent-dirichlet-allocation.html">Latent Dirichlet Allocation</a></li> + </ul> + </li> + <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Mahout MapReduce<b class="caret"></b></a> + <ul class="dropdown-menu"> + <li class="nav-header">Classification</li> + <li><a href="/users/classification/bayesian.html">Naive Bayes</a></li> + <li><a href="/users/classification/hidden-markov-models.html">Hidden Markov Models</a></li> + <li><a href="/users/classification/logistic-regression.html">Logistic Regression (Single Machine)</a></li> + <li><a href="/users/classification/partial-implementation.html">Random Forest</a></li> + <li class="nav-header">Classification Examples</li> + <li><a href="/users/classification/breiman-example.html">Breiman example</a></li> + <li><a href="/users/classification/twenty-newsgroups.html">20 newsgroups example</a></li> + <li><a href="/users/classification/bankmarketing-example.html">SGD classifier bank marketing</a></li> + <li><a href="/users/classification/wikipedia-classifier-example.html">Wikipedia XML parser and classifier</a></li> + <li class="nav-header">Clustering</li> + <li><a href="/users/clustering/k-means-clustering.html">k-Means</a></li> + <li><a href="/users/clustering/canopy-clustering.html">Canopy</a></li> + <li><a href="/users/clustering/fuzzy-k-means.html">Fuzzy k-Means</a></li> + <li><a href="/users/clustering/streaming-k-means.html">Streaming KMeans</a></li> + <li><a href="/users/clustering/spectral-clustering.html">Spectral Clustering</a></li> + <li class="nav-header">Clustering Commandline usage</li> + <li><a href="/users/clustering/k-means-commandline.html">Options for k-Means</a></li> + <li><a href="/users/clustering/canopy-commandline.html">Options for Canopy</a></li> + <li><a href="/users/clustering/fuzzy-k-means-commandline.html">Options for Fuzzy k-Means</a></li> + <li class="nav-header">Clustering Examples</li> + <li><a 
href="/users/clustering/clustering-of-synthetic-control-data.html">Synthetic data</a></li> + <li class="nav-header">Cluster Post processing</li> + <li><a href="/users/clustering/cluster-dumper.html">Cluster Dumper tool</a></li> + <li><a href="/users/clustering/visualizing-sample-clusters.html">Cluster visualisation</a></li> + <li class="nav-header">Recommendations</li> + <li><a href="/users/recommender/recommender-first-timer-faq.html">First Timer FAQ</a></li> + <li><a href="/users/recommender/userbased-5-minutes.html">A user-based recommender <br/>in 5 minutes</a></li> + <li><a href="/users/recommender/matrix-factorization.html">Matrix factorization-based<br/> recommenders</a></li> + <li><a href="/users/recommender/recommender-documentation.html">Overview</a></li> + <li><a href="/users/recommender/intro-itembased-hadoop.html">Intro to item-based recommendations<br/> with Hadoop</a></li> + <li><a href="/users/recommender/intro-als-hadoop.html">Intro to ALS recommendations<br/> with Hadoop</a></li> + </ul> + </li> + <!-- <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Recommendations<b class="caret"></b></a> + <ul class="dropdown-menu"> + + </ul> --> + </li> + </ul> +</div><!--/.nav-collapse --> + </div> + </div> + </div> + +</div> + + <div id="sidebar"> + <div id="sidebar-wrap"> + <h2>Twitter</h2> + <ul class="sidemenu"> + <li> +<a class="twitter-timeline" href="https://twitter.com/ApacheMahout" data-widget-id="422861673444028416">Tweets by @ApacheMahout</a> +<script>!function(d,s,id){var js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+"://platform.twitter.com/widgets.js";fjs.parentNode.insertBefore(js,fjs);}}(document,"script","twitter-wjs");</script> +</li> + </ul> + <h2>Apache Software Foundation</h2> + <ul class="sidemenu"> + <li><a href="http://www.apache.org/foundation/how-it-works.html">How the ASF works</a></li> + <li><a 
href="http://www.apache.org/foundation/getinvolved.html">Get Involved</a></li> + <li><a href="http://www.apache.org/dev/">Developer Resources</a></li> + <li><a href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li> + <li><a href="http://www.apache.org/foundation/thanks.html">Thanks</a></li> + </ul> + <h2>Related Projects</h2> + <ul class="sidemenu"> + <li><a href="http://lucene.apache.org/">Apache Lucene</a></li> + <li><a href="http://hadoop.apache.org/">Apache Hadoop</a></li> + <li><a href="http://bigtop.apache.org/">Apache Bigtop</a></li> + <li><a href="http://spark.apache.org/">Apache Spark</a></li> + <li><a href="http://flink.apache.org/">Apache Flink</a></li> + </ul> + </div> +</div> + + <div id="content-wrap" class="clearfix"> + <div id="main"> + + <h1 id="spark-naive-bayes">Spark Naive Bayes</h1> + +<h2 id="intro">Intro</h2> + +<p>Mahout currently has two flavors of Naive Bayes. The first is standard Multinomial Naive Bayes. The second is an implementation of Transformed Weight-normalized Complement Naive Bayes as introduced by Rennie et al. <a href="http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf">[1]</a>. We refer to the former as Bayes and the latter as CBayes.</p> + +<p>Where Bayes has long been a standard in text classification, CBayes is an extension of Bayes that performs particularly well on datasets with skewed classes and has been shown to be competitive with algorithms of higher complexity such as Support Vector Machines.</p> + +<h2 id="implementations">Implementations</h2> +<p>The Mahout <code class="highlighter-rouge">math-scala</code> library has an implementation of both Bayes and CBayes, which is further optimized in the <code class="highlighter-rouge">spark</code> module. Currently the Spark-optimized version provides CLI drivers for training and testing. 
Mahout Spark-Naive-Bayes models can also be trained, tested and saved to the filesystem from the Mahout Spark Shell.</p> + +<h2 id="preprocessing-and-algorithm">Preprocessing and Algorithm</h2> + +<p>As described in <a href="http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf">[1]</a> Mahout Naive Bayes is broken down into the following steps (assignments are over all possible index values):</p> + +<ul> + <li>Let <code class="highlighter-rouge">\(\vec{d}=(\vec{d_1},...,\vec{d_n})\)</code> be a set of documents; <code class="highlighter-rouge">\(d_{ij}\)</code> is the count of word <code class="highlighter-rouge">\(i\)</code> in document <code class="highlighter-rouge">\(j\)</code>.</li> + <li>Let <code class="highlighter-rouge">\(\vec{y}=(y_1,...,y_n)\)</code> be their labels.</li> + <li>Let <code class="highlighter-rouge">\(\alpha_i\)</code> be a smoothing parameter for all words in the vocabulary; let <code class="highlighter-rouge">\(\alpha=\sum_i{\alpha_i}\)</code>.</li> + <li><strong>Preprocessing</strong> (via seq2sparse): TF-IDF transformation and L2 length normalization of <code class="highlighter-rouge">\(\vec{d}\)</code> + <ol> + <li><code class="highlighter-rouge">\(d_{ij} = \sqrt{d_{ij}}\)</code></li> + <li><code class="highlighter-rouge">\(d_{ij} = d_{ij}\left(\log{\frac{\sum_k1}{\sum_k\delta_{ik}+1}}+1\right)\)</code>, where <code class="highlighter-rouge">\(\delta_{ik}\)</code> is 1 if word <code class="highlighter-rouge">\(i\)</code> occurs in document <code class="highlighter-rouge">\(k\)</code> and 0 otherwise</li> + <li><code class="highlighter-rouge">\(d_{ij} =\frac{d_{ij}}{\sqrt{\sum_k{d_{kj}^2}}}\)</code></li> + </ol> + </li> + <li><strong>Training: Bayes</strong> <code class="highlighter-rouge">\((\vec{d},\vec{y})\)</code>: calculate term weights <code class="highlighter-rouge">\(w_{ci}\)</code> as: + <ol> + <li><code class="highlighter-rouge">\(\hat\theta_{ci} = \frac{\sum_{j:y_j= c}d_{ij}+\alpha_i}{\sum_{j:y_j= c}{\sum_k{d_{kj}}}+\alpha}\)</code></li> + <li><code class="highlighter-rouge">\(w_{ci}=\log{\hat\theta_{ci}}\)</code></li> + </ol> + </li> + <li><strong>Training: CBayes</strong><code class="highlighter-rouge">\((\vec{d},\vec{y})\)</code> calculate term weights 
<code class="highlighter-rouge">\(w_{ci}\)</code> as: + <ol> + <li><code class="highlighter-rouge">\(\hat\theta_{ci} = \frac{\sum_{j:y_j\neq c}d_{ij}+\alpha_i}{\sum_{j:y_j\neq c}{\sum_k{d_{kj}}}+\alpha}\)</code></li> + <li><code class="highlighter-rouge">\(w_{ci}=-\log{\hat\theta_{ci}}\)</code></li> + <li><code class="highlighter-rouge">\(w_{ci}=\frac{w_{ci}}{\sum_i \lvert w_{ci}\rvert}\)</code></li> + </ol> + </li> + <li><strong>Label Assignment/Testing:</strong> + <ol> + <li>Let <code class="highlighter-rouge">\(\vec{t}= (t_1,...,t_n)\)</code> be a test document; let <code class="highlighter-rouge">\(t_i\)</code> be the count of word <code class="highlighter-rouge">\(i\)</code> in that document.</li> + <li>Label the document according to <code class="highlighter-rouge">\(l(t)=\arg\max_c \sum\limits_{i} t_i w_{ci}\)</code></li> + </ol> + </li> +</ul> + +<p>As we can see, the main difference between Bayes and CBayes is the weight calculation step. Where Bayes weighs terms more heavily based on the likelihood that they belong to class <code class="highlighter-rouge">\(c\)</code>, CBayes seeks to maximize term weights on the likelihood that they do not belong to any other class.</p> + +<h2 id="running-from-the-command-line">Running from the command line</h2> + +<p>Mahout provides CLI drivers for all of the above steps. Here we will give a simple overview of Mahout CLI commands used to preprocess the data, train the model and assign labels to a held-out test set. 
An <a href="https://github.com/apache/mahout/blob/master/examples/bin/classify-20newsgroups.sh">example script</a> is given for the full process from data acquisition through classification of the classic <a href="https://mahout.apache.org/users/classification/twenty-newsgroups.html">20 Newsgroups corpus</a>.</p> + +<ul> + <li> + <p><strong>Preprocessing:</strong> +For a set of Sequence File Formatted documents in PATH_TO_SEQUENCE_FILES the <a href="https://mahout.apache.org/users/basics/creating-vectors-from-text.html">mahout seq2sparse</a> command performs the TF-IDF transformations (-wt tfidf option) and L2 length normalization (-n 2 option) as follows:</p> + + <div class="highlighter-rouge"><pre class="highlight"><code> $ mahout seq2sparse + -i ${PATH_TO_SEQUENCE_FILES} + -o ${PATH_TO_TFIDF_VECTORS} + -nv + -n 2 + -wt tfidf +</code></pre> + </div> + </li> + <li> + <p><strong>Training:</strong> +The model is then trained using <code class="highlighter-rouge">mahout spark-trainnb</code>. The default is to train a Bayes model. The -c option is given to train a CBayes model:</p> + + <div class="highlighter-rouge"><pre class="highlight"><code> $ mahout spark-trainnb + -i ${PATH_TO_TFIDF_VECTORS} + -o ${PATH_TO_MODEL} + -ow + -c +</code></pre> + </div> + </li> + <li> + <p><strong>Label Assignment/Testing:</strong> +Classification and testing on a holdout set can then be performed via <code class="highlighter-rouge">mahout spark-testnb</code>. Again, the -c option indicates that the model is CBayes:</p> + + <div class="highlighter-rouge"><pre class="highlight"><code> $ mahout spark-testnb + -i ${PATH_TO_TFIDF_TEST_VECTORS} + -m ${PATH_TO_MODEL} + -c +</code></pre> + </div> + </li> +</ul> + +<h2 id="command-line-options">Command line options</h2> + +<ul> + <li> + <p><strong>Preprocessing:</strong> <em>note: still reliant on MapReduce seq2sparse</em></p> + + <p>Only relevant parameters used for Bayes/CBayes as detailed above are shown. 
Several other transformations can be performed by <code class="highlighter-rouge">mahout seq2sparse</code> and used as input to Bayes/CBayes. For a full list of <code class="highlighter-rouge">mahout seq2sparse</code> options, see the <a href="https://mahout.apache.org/users/basics/creating-vectors-from-text.html">Creating vectors from text</a> page.</p> + + <div class="highlighter-rouge"><pre class="highlight"><code> $ mahout seq2sparse + --output (-o) output The directory pathname for output. + --input (-i) input Path to job input directory. + --weight (-wt) weight The kind of weight to use. Currently TF + or TFIDF. Default: TFIDF + --norm (-n) norm The norm to use, expressed as either a + float or "INF" if you want to use the + Infinite norm. Must be greater or equal + to 0. The default is not to normalize + --overwrite (-ow) If set, overwrite the output directory + --sequentialAccessVector (-seq) (Optional) Whether output vectors should + be SequentialAccessVectors. If set true + else false + --namedVector (-nv) (Optional) Whether output vectors should + be NamedVectors. If set true else false +</code></pre> + </div> + </li> + <li> + <p><strong>Training:</strong></p> + + <div class="highlighter-rouge"><pre class="highlight"><code> $ mahout spark-trainnb + --input (-i) input Path to job input directory. + --output (-o) output The directory pathname for output. + --trainComplementary (-c) Train complementary? Default is false. + --master (-ma) Spark Master URL (optional). Default: "local". + Note that you can specify the number of + cores to get a performance improvement, + for example "local[4]" + --help (-h) Print out help +</code></pre> + </div> + </li> + <li> + <p><strong>Testing:</strong></p> + + <div class="highlighter-rouge"><pre class="highlight"><code> $ mahout spark-testnb + --input (-i) input Path to job input directory. + --model (-m) model The path to the model built during training. + --testComplementary (-c) Test complementary? Default is false. 
+ --master (-ma) Spark Master URL (optional). Default: "local". + Note that you can specify the number of + cores to get a performance improvement, + for example "local[4]" + --help (-h) Print out help +</code></pre> + </div> + </li> +</ul> + +<h2 id="examples">Examples</h2> +<ol> + <li><a href="https://github.com/apache/mahout/blob/master/examples/bin/classify-20newsgroups.sh">20 Newsgroups classification</a></li> + <li><a href="https://github.com/apache/mahout/blob/master/examples/bin/spark-document-classifier.mscala">Document classification with Naive Bayes in the Mahout shell</a></li> +</ol> + +<h2 id="references">References</h2> + +<p>[1] Jason D. M. Rennie, Lawrence Shih, Jaime Teevan and David R. Karger. <a href="http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf">Tackling the Poor Assumptions of Naive Bayes Text Classifiers</a>. Proceedings of the 20th International Conference on Machine Learning (ICML), 2003.</p> + + + </div> + </div> +</div> + <footer class="footer" align="center"> + <div class="container"> + <p> + Copyright © 2014-2016 The Apache Software Foundation, Licensed under + the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. + <br /> + Apache Mahout, Mahout, Apache, the Apache feather logo, and the elephant rider logo are either registered trademarks or trademarks of <a href="http://www.apache.org/foundation/marks/">The Apache Software Foundation</a> in the United States and other countries. + </p> + </div> + </footer> + + <script src="/assets/themes/mahout-retro/js/jquery-1.9.1.min.js"></script> + <script src="/assets/themes/mahout-retro/js/bootstrap.min.js"></script> + <script> + (function() { + var cx = '012254517474945470291:vhsfv7eokdc'; + var gcse = document.createElement('script'); + gcse.type = 'text/javascript'; + gcse.async = true; + gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') + + '//www.google.com/cse/cse.js?cx=' + cx; + var s = document.getElementsByTagName('script')[0]; + s.parentNode.insertBefore(gcse, s); + })(); + </script> +</body> +</html> +
