http://git-wip-us.apache.org/repos/asf/mahout/blob/5112e9ec/docs/latest/tutorials/map-reduce/classification/twenty-newsgroups.html ---------------------------------------------------------------------- diff --git a/docs/latest/tutorials/map-reduce/classification/twenty-newsgroups.html b/docs/latest/tutorials/map-reduce/classification/twenty-newsgroups.html index 9cefe9f..240f2e8 100644 --- a/docs/latest/tutorials/map-reduce/classification/twenty-newsgroups.html +++ b/docs/latest/tutorials/map-reduce/classification/twenty-newsgroups.html @@ -1,283 +1,169 @@ - - <!DOCTYPE html> -<html lang="en"> +<html lang="en"> + <head> <meta charset="utf-8"> <meta http-equiv="X-UA-Compatible" content="IE=edge"> + <meta name="viewport" content="width=device-width, initial-scale=1"> - <title>(Deprecated) Twenty Newsgroups</title> - - <meta name="author" content="Apache Mahout"> - - <!-- Enable responsive viewport --> - <meta name="viewport" content="width=device-width, initial-scale=1.0"> - - <!-- Bootstrap styles --> - <link href="/assets/themes/mahout3/css/bootstrap.min.css" rel="stylesheet"> - <!-- Optional theme --> - <link href="/assets/themes/mahout3/css/bootstrap-theme.min.css" rel="stylesheet"> - <!-- Sticky Footer --> - <link href="/assets/themes/mahout3/css/bs-sticky-footer.css" rel="stylesheet"> - - <!-- Custom styles --> - <link href="/assets/themes/mahout3/css/style.css" rel="stylesheet" type="text/css" media="all"> - - <!-- HTML5 Shim and Respond.js IE8 support of HTML5 elements and media queries --> - <!-- WARNING: Respond.js doesn't work if you view the page via file:// --> - <!--[if lt IE 9]> - <script src="https://oss.maxcdn.com/libs/html5shiv/3.7.0/html5shiv.js"></script> - <script src="https://oss.maxcdn.com/libs/respond.js/1.3.0/respond.min.js"></script> - <![endif]--> - - <!-- Fav and touch icons --> - <!-- Update these with your own images - <link rel="shortcut icon" href="images/favicon.ico"> - <link rel="apple-touch-icon" 
href="images/apple-touch-icon.png"> - <link rel="apple-touch-icon" sizes="72x72" href="images/apple-touch-icon-72x72.png"> - <link rel="apple-touch-icon" sizes="114x114" href="images/apple-touch-icon-114x114.png"> - --> - - <!-- atom & rss feed --> - <link href="/atom.xml" type="application/atom+xml" rel="alternate" title="Sitewide ATOM Feed"> - <link href="/rss.xml" type="application/rss+xml" rel="alternate" title="Sitewide RSS Feed"> - <script type="text/x-mathjax-config"> - MathJax.Hub.Config({ - tex2jax: { - skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] - } - }); - MathJax.Hub.Queue(function() { - var all = MathJax.Hub.getAllJax(), i; - for(i = 0; i < all.length; i += 1) { - all[i].SourceElement().parentNode.className += ' has-jax'; - } - }); - </script> - <script type="text/javascript"> - var mathjax = document.createElement('script'); - mathjax.type = 'text/javascript'; - mathjax.async = true; - - mathjax.src = ('https:' == document.location.protocol) ? - 'https://c328740.ssl.cf1.rackcdn.com/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' : - 'http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; - - var s = document.getElementsByTagName('script')[0]; - s.parentNode.insertBefore(mathjax, s); - </script> -</head> - -<nav class="navbar navbar-default navbar-fixed-top"> - <div class="container-fluid"> - <!-- Brand and toggle get grouped for better mobile display --> - <div class="navbar-header"> - <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1" aria-expanded="false"> - <span class="sr-only">Toggle navigation</span> - <span class="icon-bar"></span> - <span class="icon-bar"></span> - <span class="icon-bar"></span> - </button> - <a class="navbar-brand" href="/"> - <img src="/assets/img/Mahout-logo-82x100.png" height="30" alt="I'm mahout"> - </a> - </div> - + <title> + (Deprecated) Twenty Newsgroups + </title> + <meta name="description" 
content="Distributed Linear Algebra"> -<!-- Collect the nav links, forms, and other content for toggling --> -<div class="collapse navbar-collapse" id="main-navbar"> - <ul class="nav navbar-nav"> - - <!-- Quick Start --> - <li id="quickstart"> - <a href="/index.html" >Mahout Overview</a> - </li> - - <li id="dropdown"> - <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">Key Concepts<span class="caret"></span></a> - <ul class="dropdown-menu"> - <li><a href="/index.html">Mahout Overview</a></li> - <li><span><b> Scala DSL</b><span></li> - <li><a href="/mahout-samsara/in-core-reference.html">In-core Reference</a></li> - <li><a href="/mahout-samsara/out-of-core-reference.html">Out-of-core Reference</a></li> - <li><a href="/mahout-samsara/faq.html">Samsara FAQ</a></li> - <li role="separator" class="divider"></li> - <li><span> <b>Bindings</b><span></li> - <li><a href="/distributed/spark-bindings/">Spark Bindings</a></li> - <li><a href="/distributed/flink-bindings.html">Flink Bindings</a></li> - <li><a href="/distributed/flink-bindings.html">H20 Bindings</a></li> - <!--<li role="separator" class="divider"></li> - <li><span> <b>Native Solvers</b><span></li> - <li><a href="/native-solvers/viennacl.html">ViennaCL</a></li> - <li><a href="/native-solvers/viennacl-omp.html">ViennaCL-OMP</a></li> - <li><a href="/native-solvers/cuda.html">CUDA</a></li>--> - </ul> - </li> - - <li id="dropdown"> - <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">Tutorials<span class="caret"></span></a> - <ul class="dropdown-menu"> - <li><span> <b>Reccomenders</b><span></li> - <li><a href="/tutorials/cco-lastfm">CCO Example with Last.FM Data</a></li> - <li><a href="/tutorials/intro-cooccurrence-spark">Introduction to Cooccurrence in Spark</a></li> - <li role="separator" class="divider"></li> - <li><span> <b>Mahout Samsara</b><span></li> - <li><a 
href="/tutorials/samsara/play-with-shell.html">Playing with Samsara in Spark Shell</a></li> - <li><a href="/tutorials/samsara/playing-with-samsara-flink-batch.html">Playing with Samsara in Flink Batch</a></li> - <li><a href="/tutorials/samsara/classify-a-doc-from-the-shell.html">Text Classification (Shell)</a></li> - <li><a href="/tutorials/samsara/spark-naive-bayes.html">Spark Naive Bayes</a></li> - <li role="separator" class="divider"></li> - <li><span> <b>Misc</b><span></li> - <li><a href="/tutorials/misc/mahout-in-zeppelin">Mahout in Apache Zeppelin</a></li> - <li><a href="/tutorials/misc/contributing-algos">How To Contribute a New Algorithm</a></li> - <li><a href="/tutorials/misc/how-to-build-an-app.html">How To Build An App</a></li> - <li role="separator" class="divider"></li> - <li><span> <b>Deprecated</b><span></li> - <li><a href="/tutorials/map-reduce">MapReduce</a></li> - </ul> - </li> - - - <!-- Algorithms (Samsara / MR) --> - <li id="dropdown"> - <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">Algorithms<span class="caret"></span></a> - <ul class="dropdown-menu"> - <li><a href="/algorithms/linear-algebra">Distributed Linear Algebra</a></li> - <li><a href="/algorithms/preprocessors">Preprocessors</a></li> - <li><a href="/algorithms/regression">Regression</a></li> - <li><a href="/algorithms/reccomenders">Reccomenders</a></li> - <li role="separator" class="divider"></li> - <li><a href="/algorithms/map-reduce">MapReduce <i>(deprecated)</i></a></li> - </ul> - <!--<li><a href="/algorithms/reccomenders/recommender-overview.html">Reccomender Overview</a></li> Do we still need? 
seems like short version of next post--> - <!-- - <li><a href="/algorithms/reccomenders/intro-cooccurrence-spark.html">Intro to Coocurrence With Spark</a></li> - <li role="separator" class="divider"></li> - <li><span> <a href="/algorithms/map-reduce"><b>MapReduce</b> (deprecated)</a><span></li> + <link rel="stylesheet" href="/assets/css/main.css"> + <!-- Font Awesome --> + <link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet" integrity="sha384-wvfXpqpZZVQGK6TAh5PVlGOfQNHSoD2xbE+QkPxCAFlNEevoEH3Sl0sibVcOQVnN" crossorigin="anonymous"> - --> - </li> + <!-- Google Fonts --> + <link href="https://fonts.googleapis.com/css?family=Maven+Pro:400,500" rel="stylesheet"> + <link href="https://fonts.googleapis.com/css?family=Muli:400,400i,700,700i" rel="stylesheet"> - <!-- Scala Docs --> - <li id="dropdown"> - <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">API Docs<span class="caret"></span></a> - <ul class="dropdown-menu"> - <li><a href="/0.13.0/api/index.html">0.13.0</a></li> - </ul> - </li> + <link rel="canonical" href="http://mahout.apache.org/docs/latest/tutorials/map-reduce/classification/twenty-newsgroups.html"> + <link rel="alternate" type="application/rss+xml" title="Apache Mahout" href="/feed.xml"> - </ul> - <form class="navbar-form navbar-left"> - <div class="form-group"> - <input type="text" class="form-control" placeholder="Search"> - </div> - <button type="submit" class="btn btn-default">Submit</button> - </form> - <ul class="nav navbar-nav navbar-right"> - <li><a href="http://github.com/apache/mahout">Github</a></li> - - <!-- Apache --> - <li class="dropdown"> - <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">Apache <span class="caret"></span></a> - <ul class="dropdown-menu"> - <li><a href="http://www.apache.org/foundation/how-it-works.html">Apache Software 
Foundation</a></li> - <li><a href="http://www.apache.org/licenses/">Apache License</a></li> - <li><a href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li> - <li><a href="http://www.apache.org/foundation/thanks.html">Thanks</a></li> - </ul> - </li> - - </ul> -</div><!-- /.navbar-collapse --> +</head> - </div><!-- /.container-fluid --> -</nav> <body> -<div id="wrap"> - <body class=""> + <nav class="navbar navbar-expand-lg navbar-light bg-light navbar-mahout"> + + <div class="container"> + + <a class="navbar-brand" href="/"> + <img src="/assets/mahout-logo-blue.svg" alt="Apache Mahout"> + </a> + + <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarSupportedContent" aria-controls="navbarSupportedContent" aria-expanded="false" aria-label="Toggle navigation"> + <span class="navbar-toggler-icon"></span> + </button> + + <div class="collapse navbar-collapse" id="navbarSupportedContent"> + + <ul class="navbar-nav ml-auto"> + + <!-- Quick Start --> + <li class="nav-item"> + <a class="nav-link" href="/docs/latest/" >Mahout Overview</a> + </li> + + <li class="nav-item dropdown"> + <a class="nav-link dropdown-toggle" href="" id="navbarDropdownKeyConcepts" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Key Concepts</a> + <div class="dropdown-menu" aria-labelledby="navbarDropdownKeyConcepts"> + <a class="dropdown-item" href="/docs/latest/index.html">Mahout Overview</a> + <div class="dropdown-divider"></div> + <h6 class="dropdown-header">Scala DSL</h6> + <a class="dropdown-item" href="/docs/latest/mahout-samsara/in-core-reference.html">In-core Reference</a> + <a class="dropdown-item" href="/docs/latest/mahout-samsara/out-of-core-reference.html">Out-of-core Reference</a> + <a class="dropdown-item" href="/docs/latest/mahout-samsara/faq.html">Samsara FAQ</a> + <div class="dropdown-divider"></div> + <h6 class="dropdown-header">Distributed Engine Bindings</h6> + <a class="dropdown-item" 
href="/docs/latest/distributed/spark-bindings/">Spark Bindings</a> + <a class="dropdown-item" href="/docs/latest/distributed/flink-bindings.html">Flink Bindings</a> + <a class="dropdown-item" href="/docs/latest/distributed/flink-bindings.html">H2O Bindings</a> + <!--<div class="dropdown-divider"></div> + <h6 class="dropdown-header">Native Solvers</h6> + <a class="dropdown-item" href="/docs/latest/native-solvers/viennacl.html">ViennaCL</a></li> + <a class="dropdown-item" href="/docs/latest/native-solvers/viennacl-omp.html">ViennaCL-OMP</a></li> + <a class="dropdown-item" href="/docs/latest/native-solvers/cuda.html">CUDA</a></li>--> + </div> + </li> + + <li class="nav-item dropdown"> + <a class="nav-link dropdown-toggle" href="" id="navbarDropdownTutorials" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Tutorial</a> + <div class="dropdown-menu" aria-labelledby="navbarDropdownTutorials"> + <div class="dropdown-divider"></div> + <h6 class="dropdown-header">Recommenders</h6> + <a class="dropdown-item" href="/docs/latest/tutorials/cco-lastfm">CCO Example with Last.FM Data</a> + <a class="dropdown-item" href="/docs/latest/tutorials/intro-cooccurrence-spark">Introduction to Cooccurrence in Spark</a> + <div class="dropdown-divider"></div> + <h6 class="dropdown-header">Mahout Samsara</h6> + <a class="dropdown-item" href="/docs/latest/tutorials/samsara/play-with-shell.html">Playing with Samsara in Spark Shell</a> + <a class="dropdown-item" href="/docs/latest/tutorials/samsara/playing-with-samsara-flink-batch.html">Playing with Samsara in Flink Batch</a> + <a class="dropdown-item" href="/docs/latest/tutorials/samsara/classify-a-doc-from-the-shell.html">Text Classification (Shell)</a> + <a class="dropdown-item" href="/docs/latest/tutorials/samsara/spark-naive-bayes.html">Spark Naive Bayes</a> + <div class="dropdown-divider"></div> + <h6 class="dropdown-header">Misc</h6> + <a class="dropdown-item" href="/docs/latest/tutorials/misc/mahout-in-zeppelin">Mahout in 
Apache Zeppelin</a> + <a class="dropdown-item" href="/docs/latest/tutorials/misc/contributing-algos">How To Contribute a New Algorithm</a> + <a class="dropdown-item" href="/docs/latest/tutorials/misc/how-to-build-an-app.html">How To Build An App</a> + <div class="dropdown-divider"></div> + <h6 class="dropdown-header">Deprecated</h6> + <a class="dropdown-item" href="/docs/latest/tutorials/map-reduce">MapReduce</a> + </div> + </li> + + + <!-- Algorithms (Samsara / MR) --> + <li class="nav-item dropdown"> + <a class="nav-link dropdown-toggle" href="" id="navbarDropdownAlgorithms" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Algorithms</a> + <div class="dropdown-menu" aria-labelledby="navbarDropdownAlgorithms"> + <a class="dropdown-item" href="/docs/latest/algorithms/linear-algebra">Distributed Linear Algebra</a> + <a class="dropdown-item" href="/docs/latest/algorithms/preprocessors">Preprocessors</a> + <a class="dropdown-item" href="/docs/latest/algorithms/regression">Regression</a> + <a class="dropdown-item" href="/docs/latest/algorithms/reccomenders">Recommenders</a> + <div class="dropdown-divider"></div> + <h6 class="dropdown-header">Deprecated</h6> + <a class="dropdown-item" href="/docs/latest/algorithms/map-reduce">MapReduce <i>(deprecated)</i></a> + </div> + <!--<a class="dropdown-item" href="/docs/latest/algorithms/reccomenders/recommender-overview.html">Reccomender Overview</a></li> Do we still need? 
seems like short version of next post--> + <!-- + <a class="dropdown-item" href="/docs/latest/algorithms/reccomenders/intro-cooccurrence-spark.html">Intro to Coocurrence With Spark</a></li> + <li role="separator" class="divider"></li> + <li><span> <a href="/docs/latest/algorithms/map-reduce"><b>MapReduce</b> (deprecated)</a><span></li> + + + --> + </li> + + <!-- Scala Docs --> + <li class="nav-item dropdown"> + <a class="nav-link dropdown-toggle" href="" id="navbarDropdownApiDocs" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a> + <div class="dropdown-menu" aria-labelledby="navbarDropdownApiDocs"> + <a class="dropdown-item" href="/docs/latest/0.13.0/api/index.html">0.13.0</a> + </div> + </li> + + <!-- Apache --> + <li class="nav-item dropdown"> + <a class="nav-link dropdown-toggle" href="" id="navbarDropdownApache" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Apache</a> + <div class="dropdown-menu" aria-labelledby="navbarDropdownApache"> + <a class="dropdown-item" href="https://www.apache.org/foundation/how-it-works.html">Apache Software Foundation</a> + <a class="dropdown-item" href="https://www.apache.org/licenses/">Apache License</a> + <a class="dropdown-item" href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a> + <a class="dropdown-item" href="https://www.apache.org/foundation/thanks.html">Thanks</a> + </div> + </li> - <div class="container"> - + </ul> + + <!--<form class="navbar-form navbar-left">--> + <!--<div class="form-group">--> + <!--<input type="text" class="form-control" placeholder="Search">--> + <!--</div>--> + <!--<button type="submit" class="btn btn-default">Submit</button>--> + <!--</form>--> + <!--<ul class="nav navbar-nav navbar-right">--> + <!--<a class="dropdown-item" href="http://github.com/apache/mahout">Github</a></li>--> -<div class="row"> - <div class="col-xs-3"> - <div id="TutorialMenu"> - <span><b>Tutorials</b></span> - <div class="list-group panel"> - <a 
href="#linalg" class="list-group-item list-group-item-success" data-toggle="collapse" data-parent="#TutorialMenu"><b>Linear Algebra</b><i class="fa fa-caret-down"></i></a> - <div class="collapse" id="linalg"> - <ul class="nav sidebar-nav"> - <li><a href="/tutorials/eigenfaces">Eigenfaces Demo (Shell or Zeppelin)</a></li> - </ul> - </div> - <a href="#reccomenders" class="list-group-item list-group-item-success" data-toggle="collapse" data-parent="#TutorialMenu"><b>Reccomenders</b><i class="fa fa-caret-down"></i></a> - <div class="collapse" id="reccomenders"> - <ul class="nav sidebar-nav"> - <li><a href="/tutorials/cco-lastfm">CCO Example with Last.FM Data</a></li> - <li><a href="/tutorials/intro-cooccurrence-spark">Introduction to Cooccurrence in Spark</a></li> - </ul> - </div> - <a href="#other" class="list-group-item list-group-item-success" data-toggle="collapse" data-parent="#TutorialMenu"><b>Other</b><i class="fa fa-caret-down"></i></a> - <div class="collapse" id="other"> - <ul class="nav sidebar-nav"> - <li><a href="/tutorials/misc/mahout-in-zeppelin">Mahout in Apache Zeppelin</a></li> - <li><a href="/tutorials/misc/contributing-algos">How To Contribute a New Algorithm</a></li> - <li><a href="/tutorials/misc/how-to-build-an-app.html">How To Build An App</a></li> - </ul> - </div> - </div> - <span><b>Map Reduce Tutorials</b> (deprecated)</span> - <div class="list-group panel"> - <a href="#classification" class="list-group-item list-group-item-success" data-toggle="collapse" data-parent="#MrTutorialMenu"><b>Classification</b><i class="fa fa-caret-down"></i></a> - <div class="collapse" id="classification"> - <ul class="nav sidebar-nav"> - <li> <a href="/tutorials/map-reduce/classification/bankmarketing-example.html">Bank Marketing Example</a></li> - <li> <a href="/tutorials/map-reduce/classification/breiman-example.html">Breiman Example</a></li> - <li> <a href="/tutorials/map-reduce/classification/twenty-newsgroups.html">Twenty Newsgroups Example</a></li> - <li> 
<a href="/tutorials/map-reduce/classification/wikipedia-classifier-example.html">Wikipedia Classifier Example</a></li> - <li> <a href="/tutorials/map-reduce/classification/parallel-frequent-pattern-mining.html">Parallel Frequent Pattern Mining</a></li> - </ul> - </div> - <a href="#clustering" class="list-group-item list-group-item-success" data-toggle="collapse" data-parent="#MrTutorialMenu"><b>Clustering</b><i class="fa fa-caret-down"></i></a> - <div class="collapse" id="clustering"> - <ul class="nav sidebar-nav"> - <li> <a href="/tutorials/map-reduce/clustering/20newsgroups.html">Twenty Newsgroups Example</a></li> - <li> <a href="/tutorials/map-reduce/clustering/canopy-commandline.html">Canopy Clustering from the Commandline</a></li> - <li> <a href="/tutorials/map-reduce/clustering/clustering-of-synthetic-control-data.html">Clustering of Synthetic Control Data</a></li> - <li> <a href="/tutorials/map-reduce/clustering/clustering-seinfeld-episodes.html">Clustering of Seinfeld Episodes</a></li> - <li> <a href="/tutorials/map-reduce/clustering/clusteringyourdata.html">Clustering Your Data</a></li> - <li> <a href="/tutorials/map-reduce/clustering/fuzzy-k-means-commandline.html">Fuzzy K-Means from the Commandline</a></li> - <li> <a href="/tutorials/map-reduce/clustering/k-means-commandline.html">K-Means from the Commandline</a></li> - <li> <a href="/tutorials/map-reduce/clustering/lda-commandline.html">LDA from the Commandline</a></li> - <li> <a href="/tutorials/map-reduce/clustering/viewing-results.html">Viewing Results</a></li> - <li> <a href="/tutorials/map-reduce/clustering/visualizing-sample-clusters.html">Visualizing Sample Clusters</a></li> - </ul> - </div> - <a href="#misc" class="list-group-item list-group-item-success" data-toggle="collapse" data-parent="#MrTutorialMenu"><b>Miscelaneous</b><i class="fa fa-caret-down"></i></a> - <div class="collapse" id="misc"> - <ul class="nav sidebar-nav"> - <li> <a href="/tutorials/map-reduce/misc/mr---map-reduce.html">MR 
Map-Reduce</a></li> - <li> <a href="/tutorials/map-reduce/misc/parallel-frequent-pattern-mining.html">Parallel Frequent Pattern Mining</a></li> - <li> <a href="/tutorials/map-reduce/misc/using-mahout-with-python-via-jpype.html">Using Mahout (Map Reduce) with Python via Jpype</a></li> - </ul> - </div> - </div> -</div> + <!--</ul>--> + </div><!-- /.navbar-collapse --> </div> +</nav> - <div class="col-xs-8"> - <div class="page-header"> - <h1>(Deprecated) Twenty Newsgroups </h1> - </div> - <p><a name="TwentyNewsgroups-TwentyNewsgroupsClassificationExample"></a></p> + <div class="container mt-5 pb-4"> + + <div class="row"> + + <div class="col-lg-8"> + <p><a name="TwentyNewsgroups-TwentyNewsgroupsClassificationExample"></a></p> <h2 id="twenty-newsgroups-classification-example">Twenty Newsgroups Classification Example</h2> <p><a name="TwentyNewsgroups-Introduction"></a></p> @@ -312,35 +198,40 @@ the 20 newsgroups.</p> <li> <p>If running Hadoop in cluster mode, start the hadoop daemons by executing the following commands:</p> - <pre><code> $ cd $HADOOP_HOME/bin + <div class="highlighter-rouge"><pre class="highlight"><code> $ cd $HADOOP_HOME/bin $ ./start-all.sh </code></pre> + </div> <p>Otherwise:</p> - <pre><code> $ export MAHOUT_LOCAL=true + <div class="highlighter-rouge"><pre class="highlight"><code> $ export MAHOUT_LOCAL=true </code></pre> + </div> </li> <li> <p>In the trunk directory of Mahout, compile and install Mahout:</p> - <pre><code> $ cd $MAHOUT_HOME + <div class="highlighter-rouge"><pre class="highlight"><code> $ cd $MAHOUT_HOME $ mvn -DskipTests clean install </code></pre> + </div> </li> <li> <p>Run the <a href="https://github.com/apache/mahout/blob/master/examples/bin/classify-20newsgroups.sh">20 newsgroups example script</a> by executing:</p> - <pre><code> $ ./examples/bin/classify-20newsgroups.sh + <div class="highlighter-rouge"><pre class="highlight"><code> $ ./examples/bin/classify-20newsgroups.sh </code></pre> + </div> </li> <li> <p>You will be prompted 
to select a classification method algorithm:</p> - <pre><code> 1. Complement Naive Bayes + <div class="highlighter-rouge"><pre class="highlight"><code> 1. Complement Naive Bayes 2. Naive Bayes 3. Stochastic Gradient Descent </code></pre> + </div> </li> </ol> @@ -358,7 +249,7 @@ the 20 newsgroups.</p> <p>Output should look something like:</p> -<pre><code>======================================================= +<div class="highlighter-rouge"><pre class="highlight"><code>======================================================= Confusion Matrix ------------------------------------------------------- a b c d e f g h i j k l m n o p q r s t <--Classified as @@ -390,6 +281,7 @@ Accuracy 90.8596% Reliability 86.3632% Reliability (standard deviation) 0.2131 </code></pre> +</div> <p><a name="TwentyNewsgroups-ComplementaryNaiveBayes"></a></p> <h2 id="end-to-end-commands-to-build-a-cbayes-model-for-20-newsgroups">End to end commands to build a CBayes model for 20 newsgroups</h2> @@ -401,14 +293,15 @@ Reliability (standard deviation) 0.2131 <li> <p>Create a working directory for the dataset and all input/output.</p> - <pre><code> $ export WORK_DIR=/tmp/mahout-work-${USER} + <div class="highlighter-rouge"><pre class="highlight"><code> $ export WORK_DIR=/tmp/mahout-work-${USER} $ mkdir -p ${WORK_DIR} </code></pre> + </div> </li> <li> <p>Download and extract the <em>20news-bydate.tar.gz</em> from the <a href="http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz">20newsgroups dataset</a> to the working directory.</p> - <pre><code> $ curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz + <div class="highlighter-rouge"><pre class="highlight"><code> $ curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz $ mkdir -p ${WORK_DIR}/20news-bydate $ cd ${WORK_DIR}/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd .. 
@@ -417,42 +310,46 @@ Reliability (standard deviation) 0.2131 $ hadoop dfs -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all </code></pre> + </div> </li> <li> <p>Convert the full 20 newsgroups dataset into a < Text, Text > SequenceFile.</p> - <pre><code> $ mahout seqdirectory + <div class="highlighter-rouge"><pre class="highlight"><code> $ mahout seqdirectory -i ${WORK_DIR}/20news-all -o ${WORK_DIR}/20news-seq -ow </code></pre> + </div> </li> <li> <p>Convert and preprocesses the dataset into a < Text, VectorWritable > SequenceFile containing term frequencies for each document.</p> - <pre><code> $ mahout seq2sparse + <div class="highlighter-rouge"><pre class="highlight"><code> $ mahout seq2sparse -i ${WORK_DIR}/20news-seq -o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf If we wanted to use different parsing methods or transformations on the term frequency vectors we could supply different options here e.g.: -ng 2 for bigrams or -n 2 for L2 length normalization. See the [Creating vectors from text](http://mahout.apache.org/users/basics/creating-vectors-from-text.html) page for a list of all seq2sparse options. 
</code></pre> + </div> </li> <li> <p>Split the preprocessed dataset into training and testing sets.</p> - <pre><code> $ mahout split + <div class="highlighter-rouge"><pre class="highlight"><code> $ mahout split -i ${WORK_DIR}/20news-vectors/tfidf-vectors --trainingOutput ${WORK_DIR}/20news-train-vectors --testOutput ${WORK_DIR}/20news-test-vectors --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential </code></pre> + </div> </li> <li> <p>Train the classifier.</p> - <pre><code> $ mahout trainnb + <div class="highlighter-rouge"><pre class="highlight"><code> $ mahout trainnb -i ${WORK_DIR}/20news-train-vectors -el -o ${WORK_DIR}/model @@ -460,11 +357,12 @@ Reliability (standard deviation) 0.2131 -ow -c </code></pre> + </div> </li> <li> <p>Test the classifier.</p> - <pre><code> $ mahout testnb + <div class="highlighter-rouge"><pre class="highlight"><code> $ mahout testnb -i ${WORK_DIR}/20news-test-vectors -m ${WORK_DIR}/model -l ${WORK_DIR}/labelindex @@ -472,37 +370,31 @@ Reliability (standard deviation) 0.2131 -o ${WORK_DIR}/20news-testing -c </code></pre> + </div> </li> </ol> </div> -</div> - - </div> -</div> - -<div id="footer"> - <div class="container"> - <p>© 2017 Apache Mahout - with help from <a href="http://jekyllbootstrap.com" target="_blank" title="The Definitive Jekyll Blogging Framework">Jekyll Bootstrap</a> - and <a href="http://getbootstrap.com" target="_blank">Bootstrap</a> - </p> </div> -</div> - - +</div> + <footer class="footer bg-light"> + <div class="container text-center small"> + Copyright © 2014-2017 The Apache Software Foundation, Licensed under the Apache License, Version 2.0. 
+ </div> +</footer> + <script src="/assets/vendor/jquery/jquery-slim.min.js"></script> + <script src="/assets/vendor/popper/popper.min.js"></script> + <script src="/assets/vendor/bootstrap/js/bootstrap.min.js"></script> + <script src="/assets/header.js"></script> + <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"></script> -<!-- Latest compiled and minified JavaScript, requires jQuery 1.x (2.x not supported in IE8) --> -<!-- Placed at the end of the document so the pages load faster --> -<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.10.2/jquery.min.js"></script> -<script src="/assets/themes/mahout3/js/bootstrap.min.js"></script> </body> -</html> +</html>
http://git-wip-us.apache.org/repos/asf/mahout/blob/5112e9ec/docs/latest/tutorials/map-reduce/classification/wikipedia-classifier-example.html ---------------------------------------------------------------------- diff --git a/docs/latest/tutorials/map-reduce/classification/wikipedia-classifier-example.html b/docs/latest/tutorials/map-reduce/classification/wikipedia-classifier-example.html index 79ecc0b..aa5e20b 100644 --- a/docs/latest/tutorials/map-reduce/classification/wikipedia-classifier-example.html +++ b/docs/latest/tutorials/map-reduce/classification/wikipedia-classifier-example.html @@ -1,283 +1,169 @@ - - <!DOCTYPE html> -<html lang="en"> +<html lang=" en "> + <head> <meta charset="utf-8"> <meta http-equiv="X-UA-Compatible" content="IE=edge"> + <meta name="viewport" content="width=device-width, initial-scale=1"> - <title>(Deprecated) Wikipedia XML parser and Naive Bayes Example</title> - - <meta name="author" content="Apache Mahout"> - - <!-- Enable responsive viewport --> - <meta name="viewport" content="width=device-width, initial-scale=1.0"> - - <!-- Bootstrap styles --> - <link href="/assets/themes/mahout3/css/bootstrap.min.css" rel="stylesheet"> - <!-- Optional theme --> - <link href="/assets/themes/mahout3/css/bootstrap-theme.min.css" rel="stylesheet"> - <!-- Sticky Footer --> - <link href="/assets/themes/mahout3/css/bs-sticky-footer.css" rel="stylesheet"> - - <!-- Custom styles --> - <link href="/assets/themes/mahout3/css/style.css" rel="stylesheet" type="text/css" media="all"> - - <!-- HTML5 Shim and Respond.js IE8 support of HTML5 elements and media queries --> - <!-- WARNING: Respond.js doesn't work if you view the page via file:// --> - <!--[if lt IE 9]> - <script src="https://oss.maxcdn.com/libs/html5shiv/3.7.0/html5shiv.js"></script> - <script src="https://oss.maxcdn.com/libs/respond.js/1.3.0/respond.min.js"></script> - <![endif]--> - - <!-- Fav and touch icons --> - <!-- Update these with your own images - <link rel="shortcut icon" 
href="images/favicon.ico"> - <link rel="apple-touch-icon" href="images/apple-touch-icon.png"> - <link rel="apple-touch-icon" sizes="72x72" href="images/apple-touch-icon-72x72.png"> - <link rel="apple-touch-icon" sizes="114x114" href="images/apple-touch-icon-114x114.png"> - --> - - <!-- atom & rss feed --> - <link href="/atom.xml" type="application/atom+xml" rel="alternate" title="Sitewide ATOM Feed"> - <link href="/rss.xml" type="application/rss+xml" rel="alternate" title="Sitewide RSS Feed"> - <script type="text/x-mathjax-config"> - MathJax.Hub.Config({ - tex2jax: { - skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] - } - }); - MathJax.Hub.Queue(function() { - var all = MathJax.Hub.getAllJax(), i; - for(i = 0; i < all.length; i += 1) { - all[i].SourceElement().parentNode.className += ' has-jax'; - } - }); - </script> - <script type="text/javascript"> - var mathjax = document.createElement('script'); - mathjax.type = 'text/javascript'; - mathjax.async = true; - - mathjax.src = ('https:' == document.location.protocol) ? 
- 'https://c328740.ssl.cf1.rackcdn.com/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' : - 'http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; - - var s = document.getElementsByTagName('script')[0]; - s.parentNode.insertBefore(mathjax, s); - </script> -</head> - -<nav class="navbar navbar-default navbar-fixed-top"> - <div class="container-fluid"> - <!-- Brand and toggle get grouped for better mobile display --> - <div class="navbar-header"> - <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1" aria-expanded="false"> - <span class="sr-only">Toggle navigation</span> - <span class="icon-bar"></span> - <span class="icon-bar"></span> - <span class="icon-bar"></span> - </button> - <a class="navbar-brand" href="/"> - <img src="/assets/img/Mahout-logo-82x100.png" height="30" alt="I'm mahout"> - </a> - </div> - + <title> + (Deprecated) Wikipedia XML parser and Naive Bayes Example + </title> + <meta name="description" content="Distributed Linear Algebra"> -<!-- Collect the nav links, forms, and other content for toggling --> -<div class="collapse navbar-collapse" id="main-navbar"> - <ul class="nav navbar-nav"> - - <!-- Quick Start --> - <li id="quickstart"> - <a href="/index.html" >Mahout Overview</a> - </li> - - <li id="dropdown"> - <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">Key Concepts<span class="caret"></span></a> - <ul class="dropdown-menu"> - <li><a href="/index.html">Mahout Overview</a></li> - <li><span><b> Scala DSL</b><span></li> - <li><a href="/mahout-samsara/in-core-reference.html">In-core Reference</a></li> - <li><a href="/mahout-samsara/out-of-core-reference.html">Out-of-core Reference</a></li> - <li><a href="/mahout-samsara/faq.html">Samsara FAQ</a></li> - <li role="separator" class="divider"></li> - <li><span> <b>Bindings</b><span></li> - <li><a 
href="/distributed/spark-bindings/">Spark Bindings</a></li> - <li><a href="/distributed/flink-bindings.html">Flink Bindings</a></li> - <li><a href="/distributed/flink-bindings.html">H20 Bindings</a></li> - <!--<li role="separator" class="divider"></li> - <li><span> <b>Native Solvers</b><span></li> - <li><a href="/native-solvers/viennacl.html">ViennaCL</a></li> - <li><a href="/native-solvers/viennacl-omp.html">ViennaCL-OMP</a></li> - <li><a href="/native-solvers/cuda.html">CUDA</a></li>--> - </ul> - </li> - - <li id="dropdown"> - <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">Tutorials<span class="caret"></span></a> - <ul class="dropdown-menu"> - <li><span> <b>Reccomenders</b><span></li> - <li><a href="/tutorials/cco-lastfm">CCO Example with Last.FM Data</a></li> - <li><a href="/tutorials/intro-cooccurrence-spark">Introduction to Cooccurrence in Spark</a></li> - <li role="separator" class="divider"></li> - <li><span> <b>Mahout Samsara</b><span></li> - <li><a href="/tutorials/samsara/play-with-shell.html">Playing with Samsara in Spark Shell</a></li> - <li><a href="/tutorials/samsara/playing-with-samsara-flink-batch.html">Playing with Samsara in Flink Batch</a></li> - <li><a href="/tutorials/samsara/classify-a-doc-from-the-shell.html">Text Classification (Shell)</a></li> - <li><a href="/tutorials/samsara/spark-naive-bayes.html">Spark Naive Bayes</a></li> - <li role="separator" class="divider"></li> - <li><span> <b>Misc</b><span></li> - <li><a href="/tutorials/misc/mahout-in-zeppelin">Mahout in Apache Zeppelin</a></li> - <li><a href="/tutorials/misc/contributing-algos">How To Contribute a New Algorithm</a></li> - <li><a href="/tutorials/misc/how-to-build-an-app.html">How To Build An App</a></li> - <li role="separator" class="divider"></li> - <li><span> <b>Deprecated</b><span></li> - <li><a href="/tutorials/map-reduce">MapReduce</a></li> - </ul> - </li> - - - <!-- Algorithms (Samsara / MR) --> - <li 
id="dropdown"> - <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">Algorithms<span class="caret"></span></a> - <ul class="dropdown-menu"> - <li><a href="/algorithms/linear-algebra">Distributed Linear Algebra</a></li> - <li><a href="/algorithms/preprocessors">Preprocessors</a></li> - <li><a href="/algorithms/regression">Regression</a></li> - <li><a href="/algorithms/reccomenders">Reccomenders</a></li> - <li role="separator" class="divider"></li> - <li><a href="/algorithms/map-reduce">MapReduce <i>(deprecated)</i></a></li> - </ul> - <!--<li><a href="/algorithms/reccomenders/recommender-overview.html">Reccomender Overview</a></li> Do we still need? seems like short version of next post--> - <!-- - <li><a href="/algorithms/reccomenders/intro-cooccurrence-spark.html">Intro to Coocurrence With Spark</a></li> - <li role="separator" class="divider"></li> - <li><span> <a href="/algorithms/map-reduce"><b>MapReduce</b> (deprecated)</a><span></li> + <link rel="stylesheet" href="/assets/css/main.css"> + <!-- Font Awesome --> + <link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet" integrity="sha384-wvfXpqpZZVQGK6TAh5PVlGOfQNHSoD2xbE+QkPxCAFlNEevoEH3Sl0sibVcOQVnN" crossorigin="anonymous"> - --> - </li> + <!-- Google Fonts --> + <link href="https://fonts.googleapis.com/css?family=Maven+Pro:400,500" rel="stylesheet"> + <link href="https://fonts.googleapis.com/css?family=Muli:400,400i,700,700i" rel="stylesheet"> - <!-- Scala Docs --> - <li id="dropdown"> - <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">API Docs<span class="caret"></span></a> - <ul class="dropdown-menu"> - <li><a href="/0.13.0/api/index.html">0.13.0</a></li> - </ul> - </li> - - - </ul> - <form class="navbar-form navbar-left"> - <div class="form-group"> - <input type="text" class="form-control" placeholder="Search"> - </div> - <button 
type="submit" class="btn btn-default">Submit</button> - </form> - <ul class="nav navbar-nav navbar-right"> - <li><a href="http://github.com/apache/mahout">Github</a></li> - - <!-- Apache --> - <li class="dropdown"> - <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">Apache <span class="caret"></span></a> - <ul class="dropdown-menu"> - <li><a href="http://www.apache.org/foundation/how-it-works.html">Apache Software Foundation</a></li> - <li><a href="http://www.apache.org/licenses/">Apache License</a></li> - <li><a href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li> - <li><a href="http://www.apache.org/foundation/thanks.html">Thanks</a></li> - </ul> - </li> + <link rel="canonical" href="http://mahout.apache.org//docs/latest/tutorials/map-reduce/classification/wikipedia-classifier-example.html"> + <link rel="alternate" type="application/rss+xml" title="Apache Mahout" href="/%20/feed.xml"> - </ul> -</div><!-- /.navbar-collapse --> - </div><!-- /.container-fluid --> -</nav> +</head> + <body> -<div id="wrap"> - <body class=""> + <nav class="navbar navbar-expand-lg navbar-light bg-light navbar-mahout"> + + <div class="container"> + + <a class="navbar-brand" href="/"> + <img src="/assets/mahout-logo-blue.svg" alt=""> + </a> + + <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarSupportedContent" aria-controls="navbarSupportedContent" aria-expanded="false" aria-label="Toggle navigation"> + <span class="navbar-toggler-icon"></span> + </button> + + <div class="collapse navbar-collapse" id="navbarSupportedContent"> + + <div class="navbar-nav ml-auto"> + + <!-- Quick Start --> + <li class="nav-item"> + <a class="nav-link" href="//docs/latest/" >Mahout Overview</a> + </li> + + <li class="nav-item dropdown"> + <a class="nav-link dropdown-toggle" href="" id="navbarDropdownMenuLink" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Key 
Concepts</a> + <div class="dropdown-menu" aria-labelledby="navbarDropdownMenuLink"> + <a class="dropdown-item" href="/docs/latest/index.html">Mahout Overview</a> + <div class="dropdown-divider"></div> + <h6 class="dropdown-header">Scala DSL</h6> + <a class="dropdown-item" href="/docs/latest/mahout-samsara/in-core-reference.html">In-core Reference</a> + <a class="dropdown-item" href="/docs/latest/mahout-samsara/out-of-core-reference.html">Out-of-core Reference</a> + <a class="dropdown-item" href="/docs/latest/mahout-samsara/faq.html">Samsara FAQ</a> + <div class="dropdown-divider"></div> + <h6 class="dropdown-header">Distributed Engine Bindings</h6> + <a class="dropdown-item" href="/docs/latest/distributed/spark-bindings/">Spark Bindings</a> + <a class="dropdown-item" href="/docs/latest/distributed/flink-bindings.html">Flink Bindings</a> + <a class="dropdown-item" href="/docs/latest/distributed/flink-bindings.html">H20 Bindings</a> + <!--<div class="dropdown-divider"></div> + <h6 class="dropdown-header">Native Solvers</h6> + <a class="dropdown-item" href="/docs/latest/native-solvers/viennacl.html">ViennaCL</a></li> + <a class="dropdown-item" href="/docs/latest/native-solvers/viennacl-omp.html">ViennaCL-OMP</a></li> + <a class="dropdown-item" href="/docs/latest/native-solvers/cuda.html">CUDA</a></li>--> + </div> + </li> + + <li class="nav-item dropdown"> + <a class="nav-link dropdown-toggle" href="" id="navbarDropdownMenuLink" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Tutorial</a> + <div class="dropdown-menu" aria-labelledby="navbarDropdownMenuLink"> + <div class="dropdown-divider"></div> + <h6 class="dropdown-header">Reccomenders</h6> + <a class="dropdown-item" href="/docs/latest/tutorials/cco-lastfm">CCO Example with Last.FM Data</a> + <a class="dropdown-item" href="/docs/latest/tutorials/intro-cooccurrence-spark">Introduction to Cooccurrence in Spark</a> + <div class="dropdown-divider"></div> + <h6 class="dropdown-header">Mahout 
Samsara</h6> + <a class="dropdown-item" href="/docs/latest/tutorials/samsara/play-with-shell.html">Playing with Samsara in Spark Shell</a> + <a class="dropdown-item" href="/docs/latest/tutorials/samsara/playing-with-samsara-flink-batch.html">Playing with Samsara in Flink Batch</a> + <a class="dropdown-item" href="/docs/latest/tutorials/samsara/classify-a-doc-from-the-shell.html">Text Classification (Shell)</a> + <a class="dropdown-item" href="/docs/latest/tutorials/samsara/spark-naive-bayes.html">Spark Naive Bayes</a> + <div class="dropdown-divider"></div> + <h6 class="dropdown-header">Misc</h6> + <a class="dropdown-item" href="/docs/latest/tutorials/misc/mahout-in-zeppelin">Mahout in Apache Zeppelin</a> + <a class="dropdown-item" href="/docs/latest/tutorials/misc/contributing-algos">How To Contribute a New Algorithm</a> + <a class="dropdown-item" href="/docs/latest/tutorials/misc/how-to-build-an-app.html">How To Build An App</a> + <div class="dropdown-divider"></div> + <h6 class="dropdown-header">Deprecated</h6> + <a class="dropdown-item" href="/docs/latest/tutorials/map-reduce">MapReduce</a> + </div> + </li> + + + <!-- Algorithms (Samsara / MR) --> + <li class="nav-item dropdown"> + <a class="nav-link dropdown-toggle" href="" id="navbarDropdownMenuLink" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Algorithms</a> + <div class="dropdown-menu" aria-labelledby="navbarDropdownMenuLink"> + <a class="dropdown-item" href="/docs/latest/algorithms/linear-algebra">Distributed Linear Algebra</a> + <a class="dropdown-item" href="/docs/latest/algorithms/preprocessors">Preprocessors</a> + <a class="dropdown-item" href="/docs/latest/algorithms/regression">Regression</a> + <a class="dropdown-item" href="/docs/latest/algorithms/reccomenders">Reccomenders</a> + <div class="dropdown-divider"></div> + <h6 class="dropdown-header">Deprecated</h6> + <a class="dropdown-item" href="/docs/latest/algorithms/map-reduce">MapReduce <i>(deprecated)</i></a> + </div> + <!--<a 
class="dropdown-item" href="/docs/latest/algorithms/reccomenders/recommender-overview.html">Reccomender Overview</a></li> Do we still need? seems like short version of next post--> + <!-- + <a class="dropdown-item" href="/docs/latest/algorithms/reccomenders/intro-cooccurrence-spark.html">Intro to Coocurrence With Spark</a></li> + <li role="separator" class="divider"></li> + <li><span> <a href="/docs/latest/algorithms/map-reduce"><b>MapReduce</b> (deprecated)</a><span></li> + + + --> + </li> + + <!-- Scala /docs --> + <li class="nav-item dropdown"> + <a class="nav-link dropdown-toggle" href="" id="navbarDropdownMenuLink" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API /docs</a> + <div class="dropdown-menu" aria-labelledby="navbarDropdownMenuLink"> + <a class="dropdown-item" href="/docs/latest/0.13.0/api/index.html">0.13.0</a> + </div> + </li> + + <!-- Apache --> + <li class="nav-item dropdown"> + <a class="nav-link dropdown-toggle" href="" id="navbarDropdownMenuLink" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Apache</a> + <div class="dropdown-menu" aria-labelledby="navbarDropdownMenuLink"> + <a class="dropdown-item" href="http://www.apache.org/foundation/how-it-works.html">Apache Software Foundation</a> + <a class="dropdown-item" href="http://www.apache.org/licenses/">Apache License</a> + <a class="dropdown-item" href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a> + <a class="dropdown-item" href="http://www.apache.org/foundation/thanks.html">Thanks</a> + </div> + </li> - <div class="container"> - + </ul> + <!--<form class="navbar-form navbar-left">--> + <!--<div class="form-group">--> + <!--<input type="text" class="form-control" placeholder="Search">--> + <!--</div>--> + <!--<button type="submit" class="btn btn-default">Submit</button>--> + <!--</form>--> + <!--<ul class="nav navbar-nav navbar-right">--> + <!--<a class="dropdown-item" href="http://github.com/apache/mahout">Github</a></li>--> -<div 
class="row"> - <div class="col-xs-3"> - <div id="TutorialMenu"> - <span><b>Tutorials</b></span> - <div class="list-group panel"> - <a href="#linalg" class="list-group-item list-group-item-success" data-toggle="collapse" data-parent="#TutorialMenu"><b>Linear Algebra</b><i class="fa fa-caret-down"></i></a> - <div class="collapse" id="linalg"> - <ul class="nav sidebar-nav"> - <li><a href="/tutorials/eigenfaces">Eigenfaces Demo (Shell or Zeppelin)</a></li> - </ul> - </div> - <a href="#reccomenders" class="list-group-item list-group-item-success" data-toggle="collapse" data-parent="#TutorialMenu"><b>Reccomenders</b><i class="fa fa-caret-down"></i></a> - <div class="collapse" id="reccomenders"> - <ul class="nav sidebar-nav"> - <li><a href="/tutorials/cco-lastfm">CCO Example with Last.FM Data</a></li> - <li><a href="/tutorials/intro-cooccurrence-spark">Introduction to Cooccurrence in Spark</a></li> - </ul> - </div> - <a href="#other" class="list-group-item list-group-item-success" data-toggle="collapse" data-parent="#TutorialMenu"><b>Other</b><i class="fa fa-caret-down"></i></a> - <div class="collapse" id="other"> - <ul class="nav sidebar-nav"> - <li><a href="/tutorials/misc/mahout-in-zeppelin">Mahout in Apache Zeppelin</a></li> - <li><a href="/tutorials/misc/contributing-algos">How To Contribute a New Algorithm</a></li> - <li><a href="/tutorials/misc/how-to-build-an-app.html">How To Build An App</a></li> - </ul> - </div> - </div> - <span><b>Map Reduce Tutorials</b> (deprecated)</span> - <div class="list-group panel"> - <a href="#classification" class="list-group-item list-group-item-success" data-toggle="collapse" data-parent="#MrTutorialMenu"><b>Classification</b><i class="fa fa-caret-down"></i></a> - <div class="collapse" id="classification"> - <ul class="nav sidebar-nav"> - <li> <a href="/tutorials/map-reduce/classification/bankmarketing-example.html">Bank Marketing Example</a></li> - <li> <a href="/tutorials/map-reduce/classification/breiman-example.html">Breiman 
Example</a></li> - <li> <a href="/tutorials/map-reduce/classification/twenty-newsgroups.html">Twenty Newsgroups Example</a></li> - <li> <a href="/tutorials/map-reduce/classification/wikipedia-classifier-example.html">Wikipedia Classifier Example</a></li> - <li> <a href="/tutorials/map-reduce/classification/parallel-frequent-pattern-mining.html">Parallel Frequent Pattern Mining</a></li> - </ul> - </div> - <a href="#clustering" class="list-group-item list-group-item-success" data-toggle="collapse" data-parent="#MrTutorialMenu"><b>Clustering</b><i class="fa fa-caret-down"></i></a> - <div class="collapse" id="clustering"> - <ul class="nav sidebar-nav"> - <li> <a href="/tutorials/map-reduce/clustering/20newsgroups.html">Twenty Newsgroups Example</a></li> - <li> <a href="/tutorials/map-reduce/clustering/canopy-commandline.html">Canopy Clustering from the Commandline</a></li> - <li> <a href="/tutorials/map-reduce/clustering/clustering-of-synthetic-control-data.html">Clustering of Synthetic Control Data</a></li> - <li> <a href="/tutorials/map-reduce/clustering/clustering-seinfeld-episodes.html">Clustering of Seinfeld Episodes</a></li> - <li> <a href="/tutorials/map-reduce/clustering/clusteringyourdata.html">Clustering Your Data</a></li> - <li> <a href="/tutorials/map-reduce/clustering/fuzzy-k-means-commandline.html">Fuzzy K-Means from the Commandline</a></li> - <li> <a href="/tutorials/map-reduce/clustering/k-means-commandline.html">K-Means from the Commandline</a></li> - <li> <a href="/tutorials/map-reduce/clustering/lda-commandline.html">LDA from the Commandline</a></li> - <li> <a href="/tutorials/map-reduce/clustering/viewing-results.html">Viewing Results</a></li> - <li> <a href="/tutorials/map-reduce/clustering/visualizing-sample-clusters.html">Visualizing Sample Clusters</a></li> - </ul> - </div> - <a href="#misc" class="list-group-item list-group-item-success" data-toggle="collapse" data-parent="#MrTutorialMenu"><b>Miscelaneous</b><i class="fa fa-caret-down"></i></a> 
- <div class="collapse" id="misc"> - <ul class="nav sidebar-nav"> - <li> <a href="/tutorials/map-reduce/misc/mr---map-reduce.html">MR Map-Reduce</a></li> - <li> <a href="/tutorials/map-reduce/misc/parallel-frequent-pattern-mining.html">Parallel Frequent Pattern Mining</a></li> - <li> <a href="/tutorials/map-reduce/misc/using-mahout-with-python-via-jpype.html">Using Mahout (Map Reduce) with Python via Jpype</a></li> - </ul> - </div> - </div> -</div> + + <!--</ul>--> + </div><!-- /.navbar-collapse --> </div> +</nav> + + <div class="container mt-5 pb-4"> - <div class="col-xs-8"> - <div class="page-header"> - <h1>(Deprecated) Wikipedia XML parser and Naive Bayes Example </h1> - </div> - <h1 id="wikipedia-xml-parser-and-naive-bayes-classifier-example">Wikipedia XML parser and Naive Bayes Classifier Example</h1> + <div class="row"> + + <div class="col-lg-8"> + <h1 id="wikipedia-xml-parser-and-naive-bayes-classifier-example">Wikipedia XML parser and Naive Bayes Classifier Example</h1> <h2 id="introduction">Introduction</h2> <p>Mahout has an <a href="https://github.com/apache/mahout/blob/master/examples/bin/classify-wikipedia.sh">example script</a> [1] which will download a recent XML dump of the (entire if desired) <a href="http://dumps.wikimedia.org/enwiki/latest/">English Wikipedia database</a>. 
After running the classification script, you can use the <a href="https://github.com/apache/mahout/blob/master/examples/bin/spark-document-classifier.mscala">document classification script</a> from the Mahout <a href="http://mahout.apache.org/users/sparkbindings/play-with-shell.html">spark-shell</a> to vectorize and classify text from outside of the training and testing corpus using a modle built on the Wikipedia dataset.</p> @@ -286,23 +172,24 @@ <h2 id="oververview">Oververview</h2> -<p>Tou run the example simply execute the <code>$MAHOUT_HOME/examples/bin/classify-wikipedia.sh</code> script.</p> +<p>Tou run the example simply execute the <code class="highlighter-rouge">$MAHOUT_HOME/examples/bin/classify-wikipedia.sh</code> script.</p> <p>By defult the script is set to run on a medium sized Wikipedia XML dump. To run on the full set (the entire english Wikipedia) you can change the download by commenting out line 78, and uncommenting line 80 of <a href="https://github.com/apache/mahout/blob/master/examples/bin/classify-wikipedia.sh">classify-wikipedia.sh</a> [1]. However this is not recommended unless you have the resources to do so. <em>Be sure to clean your work directory when changing datasets- option (3).</em></p> -<p>The step by step process for Creating a Naive Bayes Classifier for the Wikipedia XML dump is very similar to that for <a href="http://mahout.apache.org/users/classification/twenty-newsgroups.html">creating a 20 Newsgroups Classifier</a> [4]. The only difference being that instead of running <code>$mahout seqdirectory</code> on the unzipped 20 Newsgroups file, youâll run <code>$mahout seqwiki</code> on the unzipped Wikipedia xml dump.</p> +<p>The step by step process for Creating a Naive Bayes Classifier for the Wikipedia XML dump is very similar to that for <a href="http://mahout.apache.org/users/classification/twenty-newsgroups.html">creating a 20 Newsgroups Classifier</a> [4]. 
The only difference being that instead of running <code class="highlighter-rouge">$mahout seqdirectory</code> on the unzipped 20 Newsgroups file, youâll run <code class="highlighter-rouge">$mahout seqwiki</code> on the unzipped Wikipedia xml dump.</p> -<pre><code>$ mahout seqwiki +<div class="highlighter-rouge"><pre class="highlight"><code>$ mahout seqwiki </code></pre> +</div> -<p>The above command launches <code>WikipediaToSequenceFile.java</code> which accepts a text file of categories [3] and starts an MR job to parse the each document in the XML file. This process will seek to extract documents with a wikipedia category tag which (exactly, if the <code>-exactMatchOnly</code> option is set) matches a line in the category file. If no match is found and the <code>-all</code> option is set, the document will be dumped into an âunknownâ category. The documents will then be written out as a <code><Text,Text></code> sequence file of the form (K:/category/document_title , V: document).</p> +<p>The above command launches <code class="highlighter-rouge">WikipediaToSequenceFile.java</code> which accepts a text file of categories [3] and starts an MR job to parse the each document in the XML file. This process will seek to extract documents with a wikipedia category tag which (exactly, if the <code class="highlighter-rouge">-exactMatchOnly</code> option is set) matches a line in the category file. If no match is found and the <code class="highlighter-rouge">-all</code> option is set, the document will be dumped into an âunknownâ category. The documents will then be written out as a <code class="highlighter-rouge"><Text,Text></code> sequence file of the form (K:/category/document_title , V: document).</p> <p>There are 3 different example category files available to in the /examples/src/test/resources directory: country.txt, country10.txt and country2.txt. 
You can edit these categories to extract a different corpus from the Wikipedia dataset.</p> -<p>The CLI options for <code>seqwiki</code> are as follows:</p> +<p>The CLI options for <code class="highlighter-rouge">seqwiki</code> are as follows:</p> -<pre><code>--input (-i) input pathname String +<div class="highlighter-rouge"><pre class="highlight"><code>--input (-i) input pathname String --output (-o) the output pathname String --categories (-c) the file containing the Wikipedia categories --exactMatchOnly (-e) if set, then the Wikipedia category must match @@ -310,8 +197,9 @@ directory: country.txt, country10.txt and country2.txt. You can edit these cat --all (-all) if set select all categories --removeLabels (-rl) if set, remove [[Category:labels]] from document text after extracting label. </code></pre> +</div> -<p>After <code>seqwiki</code>, the script runs <code>seq2sparse</code>, <code>split</code>, <code>trainnb</code> and <code>testnb</code> as in the <a href="http://mahout.apache.org/users/classification/twenty-newsgroups.html">step by step 20newsgroups example</a>. When all of the jobs have finished, a confusion matrix will be displayed.</p> +<p>After <code class="highlighter-rouge">seqwiki</code>, the script runs <code class="highlighter-rouge">seq2sparse</code>, <code class="highlighter-rouge">split</code>, <code class="highlighter-rouge">trainnb</code> and <code class="highlighter-rouge">testnb</code> as in the <a href="http://mahout.apache.org/users/classification/twenty-newsgroups.html">step by step 20newsgroups example</a>. When all of the jobs have finished, a confusion matrix will be displayed.</p> <p>#Resourcese</p> @@ -331,32 +219,25 @@ directory: country.txt, country10.txt and country2.txt. 
You can edit these cat </div> -</div> - </div> - -</div> - -<div id="footer"> - <div class="container"> - <p>© 2017 Apache Mahout - with help from <a href="http://jekyllbootstrap.com" target="_blank" title="The Definitive Jekyll Blogging Framework">Jekyll Bootstrap</a> - and <a href="http://getbootstrap.com" target="_blank">Bootstrap</a> - </p> </div> -</div> - - +</div> + <footer class="footer bg-light"> + <div class="container text-center small"> + Copyright © 2014-2017 The Apache Software Foundation, Licensed under the Apache License, Version 2.0. + </div> +</footer> + <script src="/assets/vendor/jquery/jquery-slim.min.js"></script> + <script src="/assets/vendor/popper/popper.min.js"></script> + <script src="/assets/vendor/bootstrap/js/bootstrap.min.js"></script> + <script src="/assets/header.js"></script> + <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"></script> -<!-- Latest compiled and minified JavaScript, requires jQuery 1.x (2.x not supported in IE8) --> -<!-- Placed at the end of the document so the pages load faster --> -<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.10.2/jquery.min.js"></script> -<script src="/assets/themes/mahout3/js/bootstrap.min.js"></script> </body> -</html> +</html>
