http://git-wip-us.apache.org/repos/asf/mahout/blob/0e718ec9/website/oldsite/_site/general/faq.html
----------------------------------------------------------------------
diff --git a/website/oldsite/_site/general/faq.html 
b/website/oldsite/_site/general/faq.html
new file mode 100644
index 0000000..10109c5
--- /dev/null
+++ b/website/oldsite/_site/general/faq.html
@@ -0,0 +1,406 @@
+
+
+<!DOCTYPE html>
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<html xmlns="http://www.w3.org/1999/xhtml"; xml:lang="en" lang="en"><head><meta 
http-equiv="Content-Type" content="text/html; charset=UTF-8">
+  <title>Apache Mahout: Scalable machine learning and data mining</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+  <meta name="Distribution" content="Global">
+  <meta name="Robots" content="index,follow">
+  <meta name="keywords" content="apache, apache hadoop, apache lucene,
+        business data mining, cluster analysis,
+        collaborative filtering, data extraction, data filtering, data 
framework, data integration,
+        data matching, data mining, data mining algorithms, data mining 
analysis, data mining data,
+        data mining introduction, data mining software,
+        data mining techniques, data representation, data set, datamining,
+        feature extraction, fuzzy k means, genetic algorithm, hadoop,
+        hierarchical clustering, high dimensional, introduction to data 
mining, kmeans,
+        knowledge discovery, learning approach, learning approaches, learning 
methods,
+        learning techniques, lucene, machine learning, machine translation, 
mahout apache,
+        mahout taste, map reduce hadoop, mining data, mining methods, naive 
bayes,
+        natural language processing,
+        supervised, text mining, time series data, unsupervised, web data 
mining">
+  <link rel="shortcut icon" type="image/x-icon" 
href="https://mahout.apache.org/images/favicon.ico";>
+  <!--<script type="text/javascript" src="/js/prototype.js"></script>-->
+  <script type="text/javascript" 
src="https://ajax.googleapis.com/ajax/libs/prototype/1.7.2.0/prototype.js";></script>
+  <script type="text/javascript" 
src="/assets/themes/mahout-retro/js/effects.js"></script>
+  <script type="text/javascript" 
src="/assets/themes/mahout-retro/js/search.js"></script>
+  <script type="text/javascript" 
src="/assets/themes/mahout-retro/js/slides.js"></script>
+
+  <link href="/assets/themes/mahout-retro/css/bootstrap.min.css" 
rel="stylesheet" media="screen">
+  <link href="/assets/themes/mahout-retro/css/bootstrap-responsive.css" 
rel="stylesheet">
+  <link rel="stylesheet" href="/assets/themes/mahout-retro/css/global.css" 
type="text/css">
+
+  <!-- mathJax stuff -- use `\(...\)` for inline style math in markdown -->
+  <script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    tex2jax: {
+      skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
+    }
+  });
+  MathJax.Hub.Queue(function() {
+    var all = MathJax.Hub.getAllJax(), i;
+    for(i = 0; i < all.length; i += 1) {
+      all[i].SourceElement().parentNode.className += ' has-jax';
+    }
+  });
+  </script>
+  <script type="text/javascript">
+    var mathjax = document.createElement('script'); 
+    mathjax.type = 'text/javascript'; 
+    mathjax.async = true;
+
+    mathjax.src = ('https:' == document.location.protocol) ?
+        
'https://c328740.ssl.cf1.rackcdn.com/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'
 : 
+        
'http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
+       
+         var s = document.getElementsByTagName('script')[0]; 
+    s.parentNode.insertBefore(mathjax, s);
+  </script>
+</head>
+
+<body id="home" data-twttr-rendered="true">
+  <div id="wrap">
+   <div id="header">
+    <div id="logo"><a href="/"><img src="/assets/img/mahout-logo-brudman.png" 
alt="Logos for Mahout and Apache Software Foundation" /></a></div>
+  <div id="search">
+    <form id="search-form" action="http://www.google.com/search"; method="get" 
class="navbar-search pull-right">    
+      <input value="http://mahout.apache.org"; name="sitesearch" type="hidden">
+      <input class="search-query" name="q" id="query" type="text">
+      <input id="submission" type="image" src="/assets/img/mahout-lupe.png" 
alt="Search" />
+    </form>
+  </div>
+ 
+    <div class="navbar navbar-inverse" 
style="position:absolute;top:133px;padding-right:0px;padding-left:0px;">
+      <div class="navbar-inner" style="border: none; background: #999; border: 
none; border-radius: 0px;">
+        <div class="container">
+          <button type="button" class="btn btn-navbar" data-toggle="collapse" 
data-target=".nav-collapse">
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+          </button>
+          <!-- <a class="brand" href="#">Apache Community Development 
Project</a> -->
+            <!--<div class="nav-collapse collapse">-->
+<div class="collapse navbar-collapse" id="main-navbar">
+    <ul class="nav navbar-nav">
+        <!-- <li><a href="/">Home</a></li> -->
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">General<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/general/downloads.html">Downloads</a>
+                <li><a href="/general/who-we-are.html">Who we are</a>
+                <li><a 
href="/general/mailing-lists,-irc-and-archives.html">Mailing Lists</a>
+                <li><a href="/general/release-notes.html">Release Notes</a>
+                <li><a href="/general/books-tutorials-and-talks.html">Books, 
Tutorials, Talks</a></li>
+                <li><a href="/general/powered-by-mahout.html">Powered By 
Mahout</a>
+                <li><a href="/general/professional-support.html">Professional 
Support</a>
+                <li class="divider"></li>
+                <li class="nav-header">Resources</li>
+                <li><a href="/general/reference-reading.html">Reference 
Reading</a>
+                <li><a href="/general/faq.html">FAQ</a>
+                <li class="divider"></li>
+                <li class="nav-header">Legal</li>
+                <li><a href="http://www.apache.org/licenses/";>License</a></li>
+                <li><a href="http://www.apache.org/security/";>Security</a></li>
+                <li><a href="/general/privacy-policy.html">Privacy Policy</a>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Developers<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/developers/developer-resources.html">Developer 
resources</a></li>
+                <li><a href="/developers/version-control.html">Version 
control</a></li>
+                <li><a href="/developers/buildingmahout.html">Build from 
source</a></li>
+                <li><a href="/developers/issue-tracker.html">Issue 
tracker</a></li>
+                <li><a href="https://builds.apache.org/job/Mahout-Quality/"; 
target="_blank">Code quality reports</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Contributions</li>
+                <li><a href="/developers/how-to-contribute.html">How to 
contribute</a></li>
+                <li><a href="/developers/how-to-become-a-committer.html">How 
to become a committer</a></li>
+                <li><a href="/developers/gsoc.html">GSoC</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">For committers</li>
+                <li><a href="/developers/how-to-update-the-website.html">How 
to update the website</a></li>
+                <li><a href="/developers/patch-check-list.html">Patch check 
list</a></li>
+                <li><a href="/developers/github.html">Handling Github 
PRs</a></li>
+                <li><a href="/developers/how-to-release.html">How to 
release</a></li>
+                <li><a href="/developers/thirdparty-dependencies.html">Third 
party dependencies</a></li>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Mahout-Samsara<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/users/sparkbindings/home.html">Scala &amp; Spark 
Bindings Overview</a></li>
+                <li><a href="/users/sparkbindings/faq.html">FAQ</a></li>
+                <li><a 
href="/users/flinkbindings/playing-with-samsara-flink.html">Flink Bindings 
Overview</a></li>
+                <li class="nav-header">Engines</li>
+                <li><a href="/users/sparkbindings/home.html">Spark</a></li>
+                <li><a 
href="/users/environment/h2o-internals.html">H2O</a></li>
+                <li><a 
href="/users/flinkbindings/flink-internals.html">Flink</a></li>
+                <li class="nav-header">References</li>
+                <li><a 
href="/users/environment/in-core-reference.html">In-Core Algebraic DSL 
Reference</a></li>
+                <li><a 
href="/users/environment/out-of-core-reference.html">Distributed Algebraic DSL 
Reference</a></li>
+                <li class="nav-header">Tutorials</li>
+                <li><a 
href="/users/sparkbindings/play-with-shell.html">Playing with Mahout's Spark 
Shell</a></li>
+                <li><a href="/users/environment/how-to-build-an-app.html">How 
to build an app</a></li>
+                <li><a 
href="/users/environment/classify-a-doc-from-the-shell.html">Building a text 
classifier in Mahout's Spark Shell</a></li>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Algorithms<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/users/basics/algorithms.html">List of 
algorithms</a>
+                <li class="nav-header">Distributed Matrix Decomposition</li>
+                <li><a href="/users/algorithms/d-qr.html">Cholesky QR</a></li>
+                <li><a href="/users/algorithms/d-ssvd.html">SSVD</a></li>
+                <li><a href="/users/algorithms/d-als.html">Distributed 
ALS</a></li>
+                <li><a href="/users/algorithms/d-spca.html">SPCA</a></li>
+                <li class="nav-header">Recommendations</li>
+                <li><a 
href="/users/algorithms/recommender-overview.html">Recommender Overview</a></li>
+                <li><a 
href="/users/algorithms/intro-cooccurrence-spark.html">Intro to 
cooccurrence-based<br/> recommendations with Spark</a></li>
+                <li class="nav-header">Classification</li>
+                <li><a href="/users/algorithms/spark-naive-bayes.html">Spark 
Naive Bayes</a></li>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">MapReduce Basics<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/users/basics/algorithms.html">List of 
algorithms</a>
+                <li><a href="/users/basics/quickstart.html">Overview</a>
+                <li class="divider"></li>
+                <li class="nav-header">Working with text</li>
+                <li><a 
href="/users/basics/creating-vectors-from-text.html">Creating vectors from 
text</a>
+                <li><a href="/users/basics/collocations.html">Collocations</a>
+                <li class="divider"></li>
+                <li class="nav-header">Dimensionality reduction</li>
+                <li><a 
href="/users/dim-reduction/dimensional-reduction.html">Singular Value 
Decomposition</a></li>
+                <li><a href="/users/dim-reduction/ssvd.html">Stochastic 
SVD</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Topic Models</li>
+                <li><a 
href="/users/clustering/latent-dirichlet-allocation.html">Latent Dirichlet 
Allocation</a></li>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Mahout MapReduce<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li class="nav-header">Classification</li>
+                <li><a href="/users/classification/bayesian.html">Naive 
Bayes</a></li>
+                <li><a 
href="/users/classification/hidden-markov-models.html">Hidden Markov 
Models</a></li>
+                <li><a 
href="/users/classification/logistic-regression.html">Logistic Regression 
(Single Machine)</a></li>
+                <li><a 
href="/users/classification/partial-implementation.html">Random Forest</a></li>
+                <li class="nav-header">Classification Examples</li>
+                <li><a 
href="/users/classification/breiman-example.html">Breiman example</a></li>
+                <li><a href="/users/classification/twenty-newsgroups.html">20 
newsgroups example</a></li>
+                <li><a 
href="/users/classification/bankmarketing-example.html">SGD classifier bank 
marketing</a></li>
+                <li><a 
href="/users/classification/wikipedia-classifier-example.html">Wikipedia XML 
parser and classifier</a></li>
+                <li class="nav-header">Clustering</li>
+                <li><a 
href="/users/clustering/k-means-clustering.html">k-Means</a></li>
+                <li><a 
href="/users/clustering/canopy-clustering.html">Canopy</a></li>
+                <li><a href="/users/clustering/fuzzy-k-means.html">Fuzzy 
k-Means</a></li>
+                <li><a 
href="/users/clustering/streaming-k-means.html">Streaming KMeans</a></li>
+                <li><a 
href="/users/clustering/spectral-clustering.html">Spectral Clustering</a></li>
+                <li class="nav-header">Clustering Commandline usage</li>
+                <li><a 
href="/users/clustering/k-means-commandline.html">Options for k-Means</a></li>
+                <li><a 
href="/users/clustering/canopy-commandline.html">Options for Canopy</a></li>
+                <li><a 
href="/users/clustering/fuzzy-k-means-commandline.html">Options for Fuzzy 
k-Means</a></li>
+                <li class="nav-header">Clustering Examples</li>
+                <li><a 
href="/users/clustering/clustering-of-synthetic-control-data.html">Synthetic 
data</a></li>
+                <li class="nav-header">Cluster Post processing</li>
+                <li><a href="/users/clustering/cluster-dumper.html">Cluster 
Dumper tool</a></li>
+                <li><a 
href="/users/clustering/visualizing-sample-clusters.html">Cluster 
visualisation</a></li>
+                <li class="nav-header">Recommendations</li>
+                <li><a 
href="/users/recommender/recommender-first-timer-faq.html">First Timer 
FAQ</a></li>
+                <li><a href="/users/recommender/userbased-5-minutes.html">A 
user-based recommender <br/>in 5 minutes</a></li>
+                <li><a 
href="/users/recommender/matrix-factorization.html">Matrix 
factorization-based<br/> recommenders</a></li>
+                <li><a 
href="/users/recommender/recommender-documentation.html">Overview</a></li>
+                <li><a 
href="/users/recommender/intro-itembased-hadoop.html">Intro to item-based 
recommendations<br/> with Hadoop</a></li>
+                <li><a href="/users/recommender/intro-als-hadoop.html">Intro 
to ALS recommendations<br/> with Hadoop</a></li>
+            </ul>
+        </li>
+        <!--  <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Recommendations<b class="caret"></b></a>
+          <ul class="dropdown-menu">
+
+          </ul>
+        </li> -->
+    </ul>
+</div><!--/.nav-collapse -->
+        </div>
+      </div>
+    </div>
+
+</div>
+
+ <div id="sidebar">
+  <div id="sidebar-wrap">
+    <h2>Twitter</h2>
+       <ul class="sidemenu">
+               <li>
+<a class="twitter-timeline" href="https://twitter.com/ApacheMahout"; 
data-widget-id="422861673444028416">Tweets by @ApacheMahout</a>
+<script>!function(d,s,id){var 
js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+"://platform.twitter.com/widgets.js";fjs.parentNode.insertBefore(js,fjs);}}(document,"script","twitter-wjs");</script>
+</li>
+       </ul>
+    <h2>Apache Software Foundation</h2>
+    <ul class="sidemenu">
+      <li><a href="http://www.apache.org/foundation/how-it-works.html";>How the 
ASF works</a></li>
+      <li><a href="http://www.apache.org/foundation/getinvolved.html";>Get 
Involved</a></li>
+      <li><a href="http://www.apache.org/dev/";>Developer Resources</a></li>
+      <li><a 
href="http://www.apache.org/foundation/sponsorship.html";>Sponsorship</a></li>
+      <li><a 
href="http://www.apache.org/foundation/thanks.html";>Thanks</a></li>
+    </ul>
+    <h2>Related Projects</h2>
+    <ul class="sidemenu">
+      <li><a href="http://lucene.apache.org/";>Apache Lucene</a></li>
+      <li><a href="http://hadoop.apache.org/";>Apache Hadoop</a></li>
+      <li><a href="http://bigtop.apache.org/";>Apache Bigtop</a></li>
+      <li><a href="http://spark.apache.org/";>Apache Spark</a></li>
+         <li><a href="http://flink.apache.org/";>Apache Flink</a></li>
+    </ul>
+  </div>
+</div>
+
+  <div id="content-wrap" class="clearfix">
+   <div id="main">
+
+    <h1 id="the-official-mahout-faq">The Official Mahout FAQ</h1>
+
+<p><em>General</em></p>
+
+<ol>
+  <li><a href="#whatis">What is Apache Mahout?</a></li>
+  <li><a href="#mean">What does the name mean?</a></li>
+  <li><a href="#pronounce">How is the name pronounced?</a></li>
+  <li><a href="#historical">Where can I find the origins of the Mahout 
project?</a></li>
+  <li><a href="#downloadlogo">Where can I download the Mahout logo?</a></li>
+  <li><a href="#presentations">Where can I download Mahout slide 
presentations?</a></li>
+</ol>
+
+<p><em>Algorithms</em></p>
+
+<ol>
+  <li><a href="#algos">What algorithms are implemented in Mahout?</a></li>
+  <li><a href="#todo">What algorithms are missing from Mahout?</a></li>
+  <li><a href="#hadoop">Do I need Hadoop to use Mahout?</a></li>
+</ol>
+
+<p><em>Hadoop specific questions</em></p>
+
+<ol>
+  <li><a href="#split">Mahout just won’t run in parallel on my dataset. 
Why?</a></li>
+</ol>
+
+<h1 id="answers"><em>Answers</em></h1>
+
+<h2 id="general">General</h2>
+
+<p><a name="whatis"></a></p>
+<h4 id="what-is-apache-mahout">What is Apache Mahout?</h4>
+
+<p>Apache Mahout is a suite of machine learning libraries designed to be
+scalable and robust.</p>
+
+<p><a name="mean"></a></p>
+<h4 id="what-does-the-name-mean">What does the name mean?</h4>
+
+<p>The name <a href="http://en.wikipedia.org/wiki/Mahout";>Mahout</a>
+ was originally chosen for its association with the <a 
href="http://hadoop.apache.org";>Apache Hadoop</a>
+ project.  A Mahout is a person who drives an elephant (hint: Hadoop’s logo
+is an elephant).  We just wanted a name that complemented Hadoop but we see
+our project as a good driver of Hadoop in the sense that we will be using
+and testing it.  We are not, however, implying that we are controlling
+Hadoop’s development.</p>
+
+<p>Prior to coming to the ASF, those of us working on the project plan voted 
between <a href="http://en.wikipedia.org/wiki/Howdah";>Howdah</a> – the 
carriage on top of an elephant – and Mahout.</p>
+
+<p><a name="historical"></a></p>
+<h4 id="where-can-i-find-the-origins-of-the-mahout-project">Where can I find 
the origins of the Mahout project?</h4>
+
+<p>See <a 
href="http://web.archive.org/web/20080101233917/http://ml-site.grantingersoll.com/index.php?title=Main_Page";>http://ml-site.grantingersoll.com</a>
+ for the old wiki and mailing list archives (all read-only).</p>
+
+<p>Mahout was started by <a 
href="http://web.archive.org/web/20071228055210/http://ml-site.grantingersoll.com/index.php?title=Main_Page";
 class="external-link" rel="nofollow">Isabel Drost, Grant Ingersoll and Karl 
Wettin</a>. It <a 
href="http://web.archive.org/web/20080201093120/http://lucene.apache.org/#22+January+2008+-+Lucene+PMC+Approves+Mahout+Machine+Learning+Project";
 class="external-link" rel="nofollow">started</a> as part of the <a 
href="http://lucene.apache.org"; class="external-link" rel="nofollow">Lucene</a> 
project (see the <a 
href="http://web.archive.org/web/20080102151102/http://ml-site.grantingersoll.com/index.php?title=Incubator_proposal";
 class="external-link" rel="nofollow">original proposal</a>) and went on to 
become a top level project in April of 2010.</p><p style="text-align: 
left;">The original goal was to implement all 10 algorithms from Andrew 
Ng’s paper "<a 
href="http://ai.stanford.edu/~ang/papers/nips06-mapreducemulticore.pdf"; 
class="external-link" rel="nofollow">Map-Reduce for Machine Learning on 
Multicore</a>".</p>
+
+<p><a name="pronounce"></a></p>
+<h4 id="how-is-the-name-pronounced">How is the name pronounced?</h4>
+
+<p>There are some disagreements about how to pronounce the name. Webster’s 
has it as muh-hout (as in <a 
href="http://dictionary.reference.com/browse/mahout";>“out”</a>), but the 
Sanskrit/Hindi origins pronounce it as “muh-hoot”. The second pronunciation 
suggests a nice pun on the Hebrew word מהות meaning “essence or 
truth”.</p>
+
+<p><a name="downloadlogo"></a></p>
+<h4 id="where-can-i-download-the-mahout-logo">Where can I download the Mahout 
logo?</h4>
+
+<p>See <a 
href="https://issues.apache.org/jira/browse/MAHOUT-335";>MAHOUT-335</a></p>
+
+<p><a name="presentations"></a></p>
+<h4 id="where-can-i-download-mahout-slide-presentations">Where can I download 
Mahout slide presentations?</h4>
+
+<p>The <a 
href="https://mahout.apache.org/general/books-tutorials-and-talks.html";>Books, 
Tutorials and Talks</a>
+ page contains an overview of a wide variety of presentations with links to 
slides where available.</p>
+
+<h2 id="algorithms">Algorithms</h2>
+
+<p><a name="algos"></a></p>
+<h4 id="what-algorithms-are-implemented-in-mahout">What algorithms are 
implemented in Mahout?</h4>
+
+<p>We are interested in a wide variety of machine learning algorithms, many of
+which are already implemented in Mahout. You can find a list <a 
href="https://mahout.apache.org/users/basics/algorithms.html";>here</a>.</p>
+
+<p><a name="todo"></a></p>
+<h4 id="what-algorithms-are-missing-from-mahout">What algorithms are missing 
from Mahout?</h4>
+
+<p>There are many machine learning algorithms that we would like to have in
+Mahout. If you have an algorithm or an improvement to an algorithm that you 
would
+like to implement, start a discussion on our <a 
href="https://mahout.apache.org/general/mailing-lists,-irc-and-archives.html";>mailing
 list</a>.</p>
+
+<p><a name="hadoop"></a></p>
+<h4 id="do-i-need-hadoop-to-use-mahout">Do I need Hadoop to use Mahout?</h4>
+
+<p>There are a number of algorithm implementations that require no Hadoop 
dependencies whatsoever; consult the <a 
href="https://mahout.apache.org/users/basics/algorithms.html";>algorithms 
list</a>. In the future, we might provide more algorithm implementations on 
platforms more suitable for machine learning, such as <a 
href="http://spark.apache.org";>Apache Spark</a>.</p>
+
+<h2 id="hadoop-specific-questions">Hadoop specific questions</h2>
+<p><a name="split"></a></p>
+<h4 id="mahout-just-wont-run-in-parallel-on-my-dataset-why">Mahout just 
won’t run in parallel on my dataset. Why?</h4>
+
+<p>If you are running training on a Hadoop cluster, keep in mind that the 
number of mappers started is governed by the size of the input data and the 
configured split/block size of your cluster. As a rule of thumb,
+anything below 100MB in size won’t be split by default.</p>
+
+   </div>
+  </div>     
+</div> 
+  <footer class="footer" align="center">
+    <div class="container">
+      <p>
+        Copyright &copy; 2014-2016 The Apache Software Foundation, Licensed 
under
+        the <a href="http://www.apache.org/licenses/LICENSE-2.0";>Apache 
License, Version 2.0</a>.
+        <br />
+                 Apache Mahout, Mahout, Apache, the Apache feather logo, and 
the elephant rider logo are either registered trademarks or trademarks of <a 
href="http://www.apache.org/foundation/marks/";>The Apache Software 
Foundation</a> in the United States and other countries.
+      </p>
+    </div>
+  </footer>
+  
+  <script src="/assets/themes/mahout-retro/js/jquery-1.9.1.min.js"></script>
+  <script src="/assets/themes/mahout-retro/js/bootstrap.min.js"></script>
+  <script>
+    (function() {
+      var cx = '012254517474945470291:vhsfv7eokdc';
+      var gcse = document.createElement('script');
+      gcse.type = 'text/javascript';
+      gcse.async = true;
+      gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') 
+
+          '//www.google.com/cse/cse.js?cx=' + cx;
+      var s = document.getElementsByTagName('script')[0];
+      s.parentNode.insertBefore(gcse, s);
+    })();
+  </script>
+</body>
+</html>
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/0e718ec9/website/oldsite/_site/general/glossary.html
----------------------------------------------------------------------
diff --git a/website/oldsite/_site/general/glossary.html 
b/website/oldsite/_site/general/glossary.html
new file mode 100644
index 0000000..547a698
--- /dev/null
+++ b/website/oldsite/_site/general/glossary.html
@@ -0,0 +1,318 @@
+
+
+<!DOCTYPE html>
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+  <title>Apache Mahout: Scalable machine learning and data mining</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+  <meta name="Distribution" content="Global">
+  <meta name="Robots" content="index,follow">
+  <meta name="keywords" content="apache, apache hadoop, apache lucene,
+        business data mining, cluster analysis,
+        collaborative filtering, data extraction, data filtering, data 
framework, data integration,
+        data matching, data mining, data mining algorithms, data mining 
analysis, data mining data,
+        data mining introduction, data mining software,
+        data mining techniques, data representation, data set, datamining,
+        feature extraction, fuzzy k means, genetic algorithm, hadoop,
+        hierarchical clustering, high dimensional, introduction to data 
mining, kmeans,
+        knowledge discovery, learning approach, learning approaches, learning 
methods,
+        learning techniques, lucene, machine learning, machine translation, 
mahout apache,
+        mahout taste, map reduce hadoop, mining data, mining methods, naive 
bayes,
+        natural language processing,
+        supervised, text mining, time series data, unsupervised, web data 
mining">
+  <link rel="shortcut icon" type="image/x-icon" href="https://mahout.apache.org/images/favicon.ico">
+  <!--<script type="text/javascript" src="/js/prototype.js"></script>-->
+  <script type="text/javascript" src="https://ajax.googleapis.com/ajax/libs/prototype/1.7.2.0/prototype.js"></script>
+  <script type="text/javascript" src="/assets/themes/mahout-retro/js/effects.js"></script>
+  <script type="text/javascript" src="/assets/themes/mahout-retro/js/search.js"></script>
+  <script type="text/javascript" src="/assets/themes/mahout-retro/js/slides.js"></script>
+
+  <link href="/assets/themes/mahout-retro/css/bootstrap.min.css" 
rel="stylesheet" media="screen">
+  <link href="/assets/themes/mahout-retro/css/bootstrap-responsive.css" 
rel="stylesheet">
+  <link rel="stylesheet" href="/assets/themes/mahout-retro/css/global.css" 
type="text/css">
+
+  <!-- mathJax stuff -- use `\(...\)` for inline style math in markdown -->
+  <script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    tex2jax: {
+      skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
+    }
+  });
+  MathJax.Hub.Queue(function() {
+    var all = MathJax.Hub.getAllJax(), i;
+    for(i = 0; i < all.length; i += 1) {
+      all[i].SourceElement().parentNode.className += ' has-jax';
+    }
+  });
+  </script>
+  <script type="text/javascript">
+    var mathjax = document.createElement('script'); 
+    mathjax.type = 'text/javascript'; 
+    mathjax.async = true;
+
+    mathjax.src = ('https:' == document.location.protocol) ?
+        'https://c328740.ssl.cf1.rackcdn.com/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' :
+        'http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
+       
+         var s = document.getElementsByTagName('script')[0]; 
+    s.parentNode.insertBefore(mathjax, s);
+  </script>
+</head>
+
+<body id="home" data-twttr-rendered="true">
+  <div id="wrap">
+   <div id="header">
+    <div id="logo"><a href="/"><img src="/assets/img/mahout-logo-brudman.png" alt="Logos for Mahout and Apache Software Foundation" /></a></div>
+  <div id="search">
+    <form id="search-form" action="http://www.google.com/search" method="get" class="navbar-search pull-right">
+      <input value="http://mahout.apache.org" name="sitesearch" type="hidden">
+      <input class="search-query" name="q" id="query" type="text">
+      <input id="submission" type="image" src="/assets/img/mahout-lupe.png" alt="Search" />
+    </form>
+  </div>
+ 
+    <div class="navbar navbar-inverse" 
style="position:absolute;top:133px;padding-right:0px;padding-left:0px;">
+      <div class="navbar-inner" style="border: none; background: #999; border: 
none; border-radius: 0px;">
+        <div class="container">
+          <button type="button" class="btn btn-navbar" data-toggle="collapse" 
data-target=".nav-collapse">
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+          </button>
+          <!-- <a class="brand" href="#">Apache Community Development 
Project</a> -->
+            <!--<div class="nav-collapse collapse">-->
+<div class="collapse navbar-collapse" id="main-navbar">
+    <ul class="nav navbar-nav">
+        <!-- <li><a href="/">Home</a></li> -->
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">General<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/general/downloads.html">Downloads</a>
+                <li><a href="/general/who-we-are.html">Who we are</a>
+                <li><a 
href="/general/mailing-lists,-irc-and-archives.html">Mailing Lists</a>
+                <li><a href="/general/release-notes.html">Release Notes</a>
+                <li><a href="/general/books-tutorials-and-talks.html">Books, 
Tutorials, Talks</a></li>
+                <li><a href="/general/powered-by-mahout.html">Powered By 
Mahout</a>
+                <li><a href="/general/professional-support.html">Professional 
Support</a>
+                <li class="divider"></li>
+                <li class="nav-header">Resources</li>
+                <li><a href="/general/reference-reading.html">Reference 
Reading</a>
+                <li><a href="/general/faq.html">FAQ</a>
+                <li class="divider"></li>
+                <li class="nav-header">Legal</li>
+                <li><a href="http://www.apache.org/licenses/">License</a></li>
+                <li><a href="http://www.apache.org/security/">Security</a></li>
+                <li><a href="/general/privacy-policy.html">Privacy Policy</a>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Developers<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/developers/developer-resources.html">Developer 
resources</a></li>
+                <li><a href="/developers/version-control.html">Version 
control</a></li>
+                <li><a href="/developers/buildingmahout.html">Build from 
source</a></li>
+                <li><a href="/developers/issue-tracker.html">Issue 
tracker</a></li>
+                <li><a href="https://builds.apache.org/job/Mahout-Quality/" target="_blank">Code quality reports</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Contributions</li>
+                <li><a href="/developers/how-to-contribute.html">How to 
contribute</a></li>
+                <li><a href="/developers/how-to-become-a-committer.html">How 
to become a committer</a></li>
+                <li><a href="/developers/gsoc.html">GSoC</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">For committers</li>
+                <li><a href="/developers/how-to-update-the-website.html">How 
to update the website</a></li>
+                <li><a href="/developers/patch-check-list.html">Patch check 
list</a></li>
+                <li><a href="/developers/github.html">Handling Github 
PRs</a></li>
+                <li><a href="/developers/how-to-release.html">How to 
release</a></li>
+                <li><a href="/developers/thirdparty-dependencies.html">Third 
party dependencies</a></li>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Mahout-Samsara<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/users/sparkbindings/home.html">Scala &amp; Spark 
Bindings Overview</a></li>
+                <li><a href="/users/sparkbindings/faq.html">FAQ</a></li>
+                <li><a 
href="/users/flinkbindings/playing-with-samsara-flink.html">Flink Bindings 
Overview</a></li>
+                <li class="nav-header">Engines</li>
+                <li><a href="/users/sparkbindings/home.html">Spark</a></li>
+                <li><a 
href="/users/environment/h2o-internals.html">H2O</a></li>
+                <li><a 
href="/users/flinkbindings/flink-internals.html">Flink</a></li>
+                <li class="nav-header">References</li>
+                <li><a 
href="/users/environment/in-core-reference.html">In-Core Algebraic DSL 
Reference</a></li>
+                <li><a 
href="/users/environment/out-of-core-reference.html">Distributed Algebraic DSL 
Reference</a></li>
+                <li class="nav-header">Tutorials</li>
+                <li><a 
href="/users/sparkbindings/play-with-shell.html">Playing with Mahout's Spark 
Shell</a></li>
+                <li><a href="/users/environment/how-to-build-an-app.html">How 
to build an app</a></li>
+                <li><a 
href="/users/environment/classify-a-doc-from-the-shell.html">Building a text 
classifier in Mahout's Spark Shell</a></li>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Algorithms<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/users/basics/algorithms.html">List of 
algorithms</a>
+                <li class="nav-header">Distributed Matrix Decomposition</li>
+                <li><a href="/users/algorithms/d-qr.html">Cholesky QR</a></li>
+                <li><a href="/users/algorithms/d-ssvd.html">SSVD</a></li>
+                <li><a href="/users/algorithms/d-als.html">Distributed 
ALS</a></li>
+                <li><a href="/users/algorithms/d-spca.html">SPCA</a></li>
+                <li class="nav-header">Recommendations</li>
+                <li><a 
href="/users/algorithms/recommender-overview.html">Recommender Overview</a></li>
+                <li><a 
href="/users/algorithms/intro-cooccurrence-spark.html">Intro to 
cooccurrence-based<br/> recommendations with Spark</a></li>
+                <li class="nav-header">Classification</li>
+                <li><a href="/users/algorithms/spark-naive-bayes.html">Spark 
Naive Bayes</a></li>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">MapReduce Basics<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/users/basics/algorithms.html">List of 
algorithms</a>
+                <li><a href="/users/basics/quickstart.html">Overview</a>
+                <li class="divider"></li>
+                <li class="nav-header">Working with text</li>
+                <li><a 
href="/users/basics/creating-vectors-from-text.html">Creating vectors from 
text</a>
+                <li><a href="/users/basics/collocations.html">Collocations</a>
+                <li class="divider"></li>
+                <li class="nav-header">Dimensionality reduction</li>
+                <li><a 
href="/users/dim-reduction/dimensional-reduction.html">Singular Value 
Decomposition</a></li>
+                <li><a href="/users/dim-reduction/ssvd.html">Stochastic 
SVD</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Topic Models</li>
+                <li><a 
href="/users/clustering/latent-dirichlet-allocation.html">Latent Dirichlet 
Allocation</a></li>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Mahout MapReduce<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li class="nav-header">Classification</li>
+                <li><a href="/users/classification/bayesian.html">Naive 
Bayes</a></li>
+                <li><a 
href="/users/classification/hidden-markov-models.html">Hidden Markov 
Models</a></li>
+                <li><a 
href="/users/classification/logistic-regression.html">Logistic Regression 
(Single Machine)</a></li>
+                <li><a 
href="/users/classification/partial-implementation.html">Random Forest</a></li>
+                <li class="nav-header">Classification Examples</li>
+                <li><a 
href="/users/classification/breiman-example.html">Breiman example</a></li>
+                <li><a href="/users/classification/twenty-newsgroups.html">20 
newsgroups example</a></li>
+                <li><a 
href="/users/classification/bankmarketing-example.html">SGD classifier bank 
marketing</a></li>
+                <li><a 
href="/users/classification/wikipedia-classifier-example.html">Wikipedia XML 
parser and classifier</a></li>
+                <li class="nav-header">Clustering</li>
+                <li><a 
href="/users/clustering/k-means-clustering.html">k-Means</a></li>
+                <li><a 
href="/users/clustering/canopy-clustering.html">Canopy</a></li>
+                <li><a href="/users/clustering/fuzzy-k-means.html">Fuzzy 
k-Means</a></li>
+                <li><a 
href="/users/clustering/streaming-k-means.html">Streaming KMeans</a></li>
+                <li><a 
href="/users/clustering/spectral-clustering.html">Spectral Clustering</a></li>
+                <li class="nav-header">Clustering Commandline usage</li>
+                <li><a 
href="/users/clustering/k-means-commandline.html">Options for k-Means</a></li>
+                <li><a 
href="/users/clustering/canopy-commandline.html">Options for Canopy</a></li>
+                <li><a 
href="/users/clustering/fuzzy-k-means-commandline.html">Options for Fuzzy 
k-Means</a></li>
+                <li class="nav-header">Clustering Examples</li>
+                <li><a 
href="/users/clustering/clustering-of-synthetic-control-data.html">Synthetic 
data</a></li>
+                <li class="nav-header">Cluster Post processing</li>
+                <li><a href="/users/clustering/cluster-dumper.html">Cluster 
Dumper tool</a></li>
+                <li><a 
href="/users/clustering/visualizing-sample-clusters.html">Cluster 
visualisation</a></li>
+                <li class="nav-header">Recommendations</li>
+                <li><a 
href="/users/recommender/recommender-first-timer-faq.html">First Timer 
FAQ</a></li>
+                <li><a href="/users/recommender/userbased-5-minutes.html">A 
user-based recommender <br/>in 5 minutes</a></li>
+                <li><a 
href="/users/recommender/matrix-factorization.html">Matrix 
factorization-based<br/> recommenders</a></li>
+                <li><a 
href="/users/recommender/recommender-documentation.html">Overview</a></li>
+                <li><a 
href="/users/recommender/intro-itembased-hadoop.html">Intro to item-based 
recommendations<br/> with Hadoop</a></li>
+                <li><a href="/users/recommender/intro-als-hadoop.html">Intro 
to ALS recommendations<br/> with Hadoop</a></li>
+            </ul>
+        </li>
+        <!--  <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Recommendations<b class="caret"></b></a>
+          <ul class="dropdown-menu">
+
+          </ul> -->
+        </li>
+    </ul>
+</div><!--/.nav-collapse -->
+        </div>
+      </div>
+    </div>
+
+</div>
+
+ <div id="sidebar">
+  <div id="sidebar-wrap">
+    <h2>Twitter</h2>
+       <ul class="sidemenu">
+               <li>
+<a class="twitter-timeline" href="https://twitter.com/ApacheMahout" data-widget-id="422861673444028416">Tweets by @ApacheMahout</a>
+<script>!function(d,s,id){var 
js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+"://platform.twitter.com/widgets.js";fjs.parentNode.insertBefore(js,fjs);}}(document,"script","twitter-wjs");</script>
+</li>
+       </ul>
+    <h2>Apache Software Foundation</h2>
+    <ul class="sidemenu">
+      <li><a href="http://www.apache.org/foundation/how-it-works.html">How the ASF works</a></li>
+      <li><a href="http://www.apache.org/foundation/getinvolved.html">Get Involved</a></li>
+      <li><a href="http://www.apache.org/dev/">Developer Resources</a></li>
+      <li><a href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li>
+      <li><a href="http://www.apache.org/foundation/thanks.html">Thanks</a></li>
+    </ul>
+    <h2>Related Projects</h2>
+    <ul class="sidemenu">
+      <li><a href="http://lucene.apache.org/">Apache Lucene</a></li>
+      <li><a href="http://hadoop.apache.org/">Apache Hadoop</a></li>
+      <li><a href="http://bigtop.apache.org/">Apache Bigtop</a></li>
+      <li><a href="http://spark.apache.org/">Apache Spark</a></li>
+         <li><a href="http://flink.apache.org/">Apache Flink</a></li>
+    </ul>
+  </div>
+</div>
+
+  <div id="content-wrap" class="clearfix">
+   <div id="main">
+
+    <p>This is a list of common glossary terms used on both the mailing lists and
+around the site. Where possible, I have tried to provide a link to more
+in-depth explanations from the web.</p>
+
+<table>
+  <tbody>
+    <tr>
+      <td>{children:excerpt=true</td>
+      <td>style=h4}</td>
+    </tr>
+  </tbody>
+</table>
+
+   </div>
+  </div>     
+</div> 
+  <footer class="footer" align="center">
+    <div class="container">
+      <p>
+        Copyright &copy; 2014-2016 The Apache Software Foundation, Licensed under
+        the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
+        <br />
+                 Apache Mahout, Mahout, Apache, the Apache feather logo, and the elephant rider logo are either registered trademarks or trademarks of <a href="http://www.apache.org/foundation/marks/">The Apache Software Foundation</a> in the United States and other countries.
+      </p>
+    </div>
+  </footer>
+  
+  <script src="/assets/themes/mahout-retro/js/jquery-1.9.1.min.js"></script>
+  <script src="/assets/themes/mahout-retro/js/bootstrap.min.js"></script>
+  <script>
+    (function() {
+      var cx = '012254517474945470291:vhsfv7eokdc';
+      var gcse = document.createElement('script');
+      gcse.type = 'text/javascript';
+      gcse.async = true;
+      gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') +
+          '//www.google.com/cse/cse.js?cx=' + cx;
+      var s = document.getElementsByTagName('script')[0];
+      s.parentNode.insertBefore(gcse, s);
+    })();
+  </script>
+</body>
+</html>
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/0e718ec9/website/oldsite/_site/general/mahout-benchmarks.html
----------------------------------------------------------------------
diff --git a/website/oldsite/_site/general/mahout-benchmarks.html 
b/website/oldsite/_site/general/mahout-benchmarks.html
new file mode 100644
index 0000000..77f9beb
--- /dev/null
+++ b/website/oldsite/_site/general/mahout-benchmarks.html
@@ -0,0 +1,454 @@
+
+
+<!DOCTYPE html>
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+  <title>Apache Mahout: Scalable machine learning and data mining</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+  <meta name="Distribution" content="Global">
+  <meta name="Robots" content="index,follow">
+  <meta name="keywords" content="apache, apache hadoop, apache lucene,
+        business data mining, cluster analysis,
+        collaborative filtering, data extraction, data filtering, data 
framework, data integration,
+        data matching, data mining, data mining algorithms, data mining 
analysis, data mining data,
+        data mining introduction, data mining software,
+        data mining techniques, data representation, data set, datamining,
+        feature extraction, fuzzy k means, genetic algorithm, hadoop,
+        hierarchical clustering, high dimensional, introduction to data 
mining, kmeans,
+        knowledge discovery, learning approach, learning approaches, learning 
methods,
+        learning techniques, lucene, machine learning, machine translation, 
mahout apache,
+        mahout taste, map reduce hadoop, mining data, mining methods, naive 
bayes,
+        natural language processing,
+        supervised, text mining, time series data, unsupervised, web data 
mining">
+  <link rel="shortcut icon" type="image/x-icon" href="https://mahout.apache.org/images/favicon.ico">
+  <!--<script type="text/javascript" src="/js/prototype.js"></script>-->
+  <script type="text/javascript" src="https://ajax.googleapis.com/ajax/libs/prototype/1.7.2.0/prototype.js"></script>
+  <script type="text/javascript" src="/assets/themes/mahout-retro/js/effects.js"></script>
+  <script type="text/javascript" src="/assets/themes/mahout-retro/js/search.js"></script>
+  <script type="text/javascript" src="/assets/themes/mahout-retro/js/slides.js"></script>
+
+  <link href="/assets/themes/mahout-retro/css/bootstrap.min.css" 
rel="stylesheet" media="screen">
+  <link href="/assets/themes/mahout-retro/css/bootstrap-responsive.css" 
rel="stylesheet">
+  <link rel="stylesheet" href="/assets/themes/mahout-retro/css/global.css" 
type="text/css">
+
+  <!-- mathJax stuff -- use `\(...\)` for inline style math in markdown -->
+  <script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    tex2jax: {
+      skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
+    }
+  });
+  MathJax.Hub.Queue(function() {
+    var all = MathJax.Hub.getAllJax(), i;
+    for(i = 0; i < all.length; i += 1) {
+      all[i].SourceElement().parentNode.className += ' has-jax';
+    }
+  });
+  </script>
+  <script type="text/javascript">
+    var mathjax = document.createElement('script'); 
+    mathjax.type = 'text/javascript'; 
+    mathjax.async = true;
+
+    mathjax.src = ('https:' == document.location.protocol) ?
+        'https://c328740.ssl.cf1.rackcdn.com/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' :
+        'http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
+       
+         var s = document.getElementsByTagName('script')[0]; 
+    s.parentNode.insertBefore(mathjax, s);
+  </script>
+</head>
+
+<body id="home" data-twttr-rendered="true">
+  <div id="wrap">
+   <div id="header">
+    <div id="logo"><a href="/"><img src="/assets/img/mahout-logo-brudman.png" alt="Logos for Mahout and Apache Software Foundation" /></a></div>
+  <div id="search">
+    <form id="search-form" action="http://www.google.com/search" method="get" class="navbar-search pull-right">
+      <input value="http://mahout.apache.org" name="sitesearch" type="hidden">
+      <input class="search-query" name="q" id="query" type="text">
+      <input id="submission" type="image" src="/assets/img/mahout-lupe.png" alt="Search" />
+    </form>
+  </div>
+ 
+    <div class="navbar navbar-inverse" 
style="position:absolute;top:133px;padding-right:0px;padding-left:0px;">
+      <div class="navbar-inner" style="border: none; background: #999; border: 
none; border-radius: 0px;">
+        <div class="container">
+          <button type="button" class="btn btn-navbar" data-toggle="collapse" 
data-target=".nav-collapse">
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+          </button>
+          <!-- <a class="brand" href="#">Apache Community Development 
Project</a> -->
+            <!--<div class="nav-collapse collapse">-->
+<div class="collapse navbar-collapse" id="main-navbar">
+    <ul class="nav navbar-nav">
+        <!-- <li><a href="/">Home</a></li> -->
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">General<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/general/downloads.html">Downloads</a>
+                <li><a href="/general/who-we-are.html">Who we are</a>
+                <li><a 
href="/general/mailing-lists,-irc-and-archives.html">Mailing Lists</a>
+                <li><a href="/general/release-notes.html">Release Notes</a>
+                <li><a href="/general/books-tutorials-and-talks.html">Books, 
Tutorials, Talks</a></li>
+                <li><a href="/general/powered-by-mahout.html">Powered By 
Mahout</a>
+                <li><a href="/general/professional-support.html">Professional 
Support</a>
+                <li class="divider"></li>
+                <li class="nav-header">Resources</li>
+                <li><a href="/general/reference-reading.html">Reference 
Reading</a>
+                <li><a href="/general/faq.html">FAQ</a>
+                <li class="divider"></li>
+                <li class="nav-header">Legal</li>
+                <li><a href="http://www.apache.org/licenses/">License</a></li>
+                <li><a href="http://www.apache.org/security/">Security</a></li>
+                <li><a href="/general/privacy-policy.html">Privacy Policy</a>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Developers<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/developers/developer-resources.html">Developer 
resources</a></li>
+                <li><a href="/developers/version-control.html">Version 
control</a></li>
+                <li><a href="/developers/buildingmahout.html">Build from 
source</a></li>
+                <li><a href="/developers/issue-tracker.html">Issue 
tracker</a></li>
+                <li><a href="https://builds.apache.org/job/Mahout-Quality/" target="_blank">Code quality reports</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Contributions</li>
+                <li><a href="/developers/how-to-contribute.html">How to 
contribute</a></li>
+                <li><a href="/developers/how-to-become-a-committer.html">How 
to become a committer</a></li>
+                <li><a href="/developers/gsoc.html">GSoC</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">For committers</li>
+                <li><a href="/developers/how-to-update-the-website.html">How 
to update the website</a></li>
+                <li><a href="/developers/patch-check-list.html">Patch check 
list</a></li>
+                <li><a href="/developers/github.html">Handling Github 
PRs</a></li>
+                <li><a href="/developers/how-to-release.html">How to 
release</a></li>
+                <li><a href="/developers/thirdparty-dependencies.html">Third 
party dependencies</a></li>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Mahout-Samsara<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/users/sparkbindings/home.html">Scala &amp; Spark 
Bindings Overview</a></li>
+                <li><a href="/users/sparkbindings/faq.html">FAQ</a></li>
+                <li><a 
href="/users/flinkbindings/playing-with-samsara-flink.html">Flink Bindings 
Overview</a></li>
+                <li class="nav-header">Engines</li>
+                <li><a href="/users/sparkbindings/home.html">Spark</a></li>
+                <li><a 
href="/users/environment/h2o-internals.html">H2O</a></li>
+                <li><a 
href="/users/flinkbindings/flink-internals.html">Flink</a></li>
+                <li class="nav-header">References</li>
+                <li><a 
href="/users/environment/in-core-reference.html">In-Core Algebraic DSL 
Reference</a></li>
+                <li><a 
href="/users/environment/out-of-core-reference.html">Distributed Algebraic DSL 
Reference</a></li>
+                <li class="nav-header">Tutorials</li>
+                <li><a 
href="/users/sparkbindings/play-with-shell.html">Playing with Mahout's Spark 
Shell</a></li>
+                <li><a href="/users/environment/how-to-build-an-app.html">How 
to build an app</a></li>
+                <li><a 
href="/users/environment/classify-a-doc-from-the-shell.html">Building a text 
classifier in Mahout's Spark Shell</a></li>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Algorithms<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/users/basics/algorithms.html">List of 
algorithms</a>
+                <li class="nav-header">Distributed Matrix Decomposition</li>
+                <li><a href="/users/algorithms/d-qr.html">Cholesky QR</a></li>
+                <li><a href="/users/algorithms/d-ssvd.html">SSVD</a></li>
+                <li><a href="/users/algorithms/d-als.html">Distributed 
ALS</a></li>
+                <li><a href="/users/algorithms/d-spca.html">SPCA</a></li>
+                <li class="nav-header">Recommendations</li>
+                <li><a 
href="/users/algorithms/recommender-overview.html">Recommender Overview</a></li>
+                <li><a 
href="/users/algorithms/intro-cooccurrence-spark.html">Intro to 
cooccurrence-based<br/> recommendations with Spark</a></li>
+                <li class="nav-header">Classification</li>
+                <li><a href="/users/algorithms/spark-naive-bayes.html">Spark 
Naive Bayes</a></li>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">MapReduce Basics<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li><a href="/users/basics/algorithms.html">List of 
algorithms</a>
+                <li><a href="/users/basics/quickstart.html">Overview</a>
+                <li class="divider"></li>
+                <li class="nav-header">Working with text</li>
+                <li><a 
href="/users/basics/creating-vectors-from-text.html">Creating vectors from 
text</a>
+                <li><a href="/users/basics/collocations.html">Collocations</a>
+                <li class="divider"></li>
+                <li class="nav-header">Dimensionality reduction</li>
+                <li><a 
href="/users/dim-reduction/dimensional-reduction.html">Singular Value 
Decomposition</a></li>
+                <li><a href="/users/dim-reduction/ssvd.html">Stochastic 
SVD</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Topic Models</li>
+                <li><a 
href="/users/clustering/latent-dirichlet-allocation.html">Latent Dirichlet 
Allocation</a></li>
+            </ul>
+        </li>
+        <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Mahout MapReduce<b class="caret"></b></a>
+            <ul class="dropdown-menu">
+                <li class="nav-header">Classification</li>
+                <li><a href="/users/classification/bayesian.html">Naive 
Bayes</a></li>
+                <li><a 
href="/users/classification/hidden-markov-models.html">Hidden Markov 
Models</a></li>
+                <li><a 
href="/users/classification/logistic-regression.html">Logistic Regression 
(Single Machine)</a></li>
+                <li><a 
href="/users/classification/partial-implementation.html">Random Forest</a></li>
+                <li class="nav-header">Classification Examples</li>
+                <li><a 
href="/users/classification/breiman-example.html">Breiman example</a></li>
+                <li><a href="/users/classification/twenty-newsgroups.html">20 
newsgroups example</a></li>
+                <li><a 
href="/users/classification/bankmarketing-example.html">SGD classifier bank 
marketing</a></li>
+                <li><a 
href="/users/classification/wikipedia-classifier-example.html">Wikipedia XML 
parser and classifier</a></li>
+                <li class="nav-header">Clustering</li>
+                <li><a 
href="/users/clustering/k-means-clustering.html">k-Means</a></li>
+                <li><a 
href="/users/clustering/canopy-clustering.html">Canopy</a></li>
+                <li><a href="/users/clustering/fuzzy-k-means.html">Fuzzy 
k-Means</a></li>
+                <li><a 
href="/users/clustering/streaming-k-means.html">Streaming KMeans</a></li>
+                <li><a 
href="/users/clustering/spectral-clustering.html">Spectral Clustering</a></li>
+                <li class="nav-header">Clustering Commandline usage</li>
+                <li><a 
href="/users/clustering/k-means-commandline.html">Options for k-Means</a></li>
+                <li><a 
href="/users/clustering/canopy-commandline.html">Options for Canopy</a></li>
+                <li><a 
href="/users/clustering/fuzzy-k-means-commandline.html">Options for Fuzzy 
k-Means</a></li>
+                <li class="nav-header">Clustering Examples</li>
+                <li><a 
href="/users/clustering/clustering-of-synthetic-control-data.html">Synthetic 
data</a></li>
+                <li class="nav-header">Cluster Post processing</li>
+                <li><a href="/users/clustering/cluster-dumper.html">Cluster 
Dumper tool</a></li>
+                <li><a 
href="/users/clustering/visualizing-sample-clusters.html">Cluster 
visualisation</a></li>
+                <li class="nav-header">Recommendations</li>
+                <li><a 
href="/users/recommender/recommender-first-timer-faq.html">First Timer 
FAQ</a></li>
+                <li><a href="/users/recommender/userbased-5-minutes.html">A 
user-based recommender <br/>in 5 minutes</a></li>
+                <li><a 
href="/users/recommender/matrix-factorization.html">Matrix 
factorization-based<br/> recommenders</a></li>
+                <li><a 
href="/users/recommender/recommender-documentation.html">Overview</a></li>
+                <li><a 
href="/users/recommender/intro-itembased-hadoop.html">Intro to item-based 
recommendations<br/> with Hadoop</a></li>
+                <li><a href="/users/recommender/intro-als-hadoop.html">Intro 
to ALS recommendations<br/> with Hadoop</a></li>
+            </ul>
+        </li>
+        <!--  <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Recommendations<b class="caret"></b></a>
+          <ul class="dropdown-menu">
+
+          </ul>
+        </li> -->
+    </ul>
+</div><!--/.nav-collapse -->
+        </div>
+      </div>
+    </div>
+
+</div>
+
+ <div id="sidebar">
+  <div id="sidebar-wrap">
+    <h2>Twitter</h2>
+       <ul class="sidemenu">
+               <li>
+<a class="twitter-timeline" href="https://twitter.com/ApacheMahout" 
data-widget-id="422861673444028416">Tweets by @ApacheMahout</a>
+<script>!function(d,s,id){var 
js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+"://platform.twitter.com/widgets.js";fjs.parentNode.insertBefore(js,fjs);}}(document,"script","twitter-wjs");</script>
+</li>
+       </ul>
+    <h2>Apache Software Foundation</h2>
+    <ul class="sidemenu">
+      <li><a href="http://www.apache.org/foundation/how-it-works.html">How the 
ASF works</a></li>
+      <li><a href="http://www.apache.org/foundation/getinvolved.html">Get 
Involved</a></li>
+      <li><a href="http://www.apache.org/dev/">Developer Resources</a></li>
+      <li><a 
href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li>
+      <li><a 
href="http://www.apache.org/foundation/thanks.html">Thanks</a></li>
+    </ul>
+    <h2>Related Projects</h2>
+    <ul class="sidemenu">
+      <li><a href="http://lucene.apache.org/">Apache Lucene</a></li>
+      <li><a href="http://hadoop.apache.org/">Apache Hadoop</a></li>
+      <li><a href="http://bigtop.apache.org/">Apache Bigtop</a></li>
+      <li><a href="http://spark.apache.org/">Apache Spark</a></li>
+         <li><a href="http://flink.apache.org/">Apache Flink</a></li>
+    </ul>
+  </div>
+</div>
+
+  <div id="content-wrap" class="clearfix">
+   <div id="main">
+
+    <p><a name="MahoutBenchmarks-Introduction"></a></p>
+<h1 id="introduction">Introduction</h1>
+
+<p>Depending on hardware configuration and the exact distribution of ratings 
over users and items, your mileage may vary (YMMV)!</p>
+
+<p><a name="MahoutBenchmarks-Recommenders"></a></p>
+<h1 id="recommenders">Recommenders</h1>
+
+<p><a name="MahoutBenchmarks-ARuleofThumb"></a></p>
+<h2 id="a-rule-of-thumb">A Rule of Thumb</h2>
+
+<p>100M preferences are about the data set size where non-distributed
+recommenders will outgrow a normal-sized machine (32-bit, &lt;= 4GB RAM). Your
+mileage will vary significantly with the nature of the data.</p>
+
+<p><a 
name="MahoutBenchmarks-Distributedrecommendervs.Wikipedialinks(May272010)"></a></p>
+<h2 id="distributed-recommender-vs-wikipedia-links-may-27-2010">Distributed 
recommender vs. Wikipedia links (May 27 2010)</h2>
+
+<p>From the mailing list:</p>
+
+<p>I just finished running a set of recommendations based on the Wikipedia
+link graph, for book purposes (yeah, it’s unconventional). I ran on my
+laptop, but it ought to be crudely representative of how it runs in a real
+cluster.</p>
+
+<p>The input is 1058MB as a text file, and contains 130M article-article
+associations, from 5.7M articles to 3.8M distinct articles (“users” and
+“items”, respectively). I estimate cost based on Amazon’s North
+American small Linux-based instance pricing of $0.085/hour. I ran on a
+dual-core laptop with plenty of RAM, allowing 1GB per worker, so this is
+valid.</p>
+
+<p>In this run, I run recommendations for all 5.7M “users”. You can 
certainly
+run for any subset of all users of course.</p>
+
+<p>Phase 1 (Item ID to item index mapping)
+29 minutes CPU time
+$0.05
+60MB output</p>
+
+<p>Phase 2 (Create user vectors)
+88 minutes CPU time
+$0.13
+Output: 1159MB</p>
+
+<p>Phase 3 (Count co-occurrence)
+77 hours CPU time
+$6.54
+Output: 23.6GB</p>
+
+<p>Phase 4 (Partial multiply prep)
+10.5 hours CPU time
+$0.90
+Output: 24.6GB</p>
+
+<p>Phase 5 (Aggregate and recommend)
+about 600 hours
+about $51.00
+about 10GB
+(I estimated these rather than let it run at home for days!)</p>
+
+<p>Note that phases 1 and 3 may be run less frequently, and need not be run
+every time. But the cost is dominated by the last step, which is most of
+the work. I’ve ignored storage costs.</p>
+
+<p>This implies a cost of $0.01 (or about 8 instance-minutes) per 1,000 user
+recommendations. That’s not bad if, say, you want to update recs for your
+site’s 100,000 daily active users for a dollar.</p>
+
+<p>There are several levers one could pull internally to sacrifice accuracy
+for speed, but it’s currently set to pretty normal values. So this is just
+one possibility.</p>
+
+<p>Now that’s not terrible, but it is about 8x more computing than would be
+needed by a non-distributed implementation <em>if</em> you could fit the whole
+data set into a very large instance’s memory, which is still possible at
+this scale but needs a pretty big instance. That’s a very apples-to-oranges
+comparison of course; different algorithms, entirely different
+environments. This is about the amount of overhead I’d expect from
+distributing – interesting to note how non-trivial it is.</p>
+
+<p><a 
name="MahoutBenchmarks-Non-distributedrecommendervs.KDDCupdataset(March2011)"></a></p>
+<h2 
id="non-distributed-recommender-vs-kdd-cup-data-set-march-2011">Non-distributed 
recommender vs. KDD Cup data set (March 2011)</h2>
+
+<p>(From the user@mahout.apache.org mailing list)</p>
+
+<p>I’ve been test-driving a simple application of Mahout recommenders (the
+non-distributed kind) on Amazon EC2 on the new Yahoo KDD Cup data set
+(kddcup.yahoo.com).</p>
+
+<p>In the spirit of open-source, like I mentioned, I’m committing the extra
+code to mahout-examples that can be used to run a Recommender on the input
+and output the right format. And, I’d like to publish the rough timings
+too. Find all the source in org.apache.mahout.cf.taste.example.kddcup</p>
+
+<p><a name="MahoutBenchmarks-Track1"></a></p>
+<h3 id="track-1">Track 1</h3>
+
+<ul>
+  <li>m2.2xlarge instance, 34.2GB RAM / 4 cores</li>
+  <li>Steady state memory consumption: ~19GB</li>
+  <li>Computation time: 30 hours (wall-clock time)</li>
+  <li>CPU time per user: ~0.43 sec</li>
+  <li>Cost on EC2: $34.20 (!)</li>
+</ul>
+
+<p>(Helpful hint on cost I realized after the fact: you can almost surely get
+spot instances for cheaper. The maximum price this sort of instance has
+gone for as a spot instance is about $0.60/hour, vs “retail price” of
+$1.14/hour.)</p>
+
+<p>Resulted in an RMSE of 29.5618 (the rating scale is 0-100), which is only
+good enough for 29th place at the moment. Not terrible for “out of the box”
+performance – it’s just using an item-based recommender with uncentered
+cosine similarity. But not really good in absolute terms. A winning
+solution is going to try to factor in time, and apply more sophisticated
+techniques. The best RMSE so far is about 23.</p>
+
+<p><a name="MahoutBenchmarks-Track2"></a></p>
+<h3 id="track-2">Track 2</h3>
+
+<ul>
+  <li>c1.xlarge instance: 7GB RAM / 8 cores</li>
+  <li>Steady state memory consumption: ~3.8GB</li>
+  <li>Computation time: 4.1 hours (wall-clock time)</li>
+  <li>CPU time per user: ~1.1 sec</li>
+  <li>Cost on EC2: $3.20</li>
+</ul>
+
+<p>For this I bothered to write a simplistic item-item similarity metric to
+take into account the additional info that is available: track, artist,
+album, genre. The result was comparatively better: 17.92% error rate, good
+enough for 4th place at the moment.</p>
+
+<p>Of course, the next task is to put this through the actual distributed
+processing – that’s really the appropriate solution.</p>
+
+<p>This shows you can still tackle fairly impressive scale with a
+non-distributed solution. These results suggest that the largest instances
+available from EC2 would accommodate almost 1 billion ratings in memory.
+However at that scale running a user’s full recommendations would easily be
+measured in seconds, not milliseconds.</p>
+
+<p><a name="MahoutBenchmarks-Clustering"></a></p>
+<h1 id="clustering">Clustering</h1>
+
+<p>See <a 
href="https://issues.apache.org/jira/browse/MAHOUT-588">MAHOUT-588</a></p>
+
+
+   </div>
+  </div>     
+</div> 
+  <footer class="footer" align="center">
+    <div class="container">
+      <p>
+        Copyright &copy; 2014-2016 The Apache Software Foundation, Licensed 
under
+        the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache 
License, Version 2.0</a>.
+        <br />
+                 Apache Mahout, Mahout, Apache, the Apache feather logo, and 
the elephant rider logo are either registered trademarks or trademarks of <a 
href="http://www.apache.org/foundation/marks/">The Apache Software 
Foundation</a> in the United States and other countries.
+      </p>
+    </div>
+  </footer>
+  
+  <script src="/assets/themes/mahout-retro/js/jquery-1.9.1.min.js"></script>
+  <script src="/assets/themes/mahout-retro/js/bootstrap.min.js"></script>
+  <script>
+    (function() {
+      var cx = '012254517474945470291:vhsfv7eokdc';
+      var gcse = document.createElement('script');
+      gcse.type = 'text/javascript';
+      gcse.async = true;
+      gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') 
+
+          '//www.google.com/cse/cse.js?cx=' + cx;
+      var s = document.getElementsByTagName('script')[0];
+      s.parentNode.insertBefore(gcse, s);
+    })();
+  </script>
+</body>
+</html>
+

Reply via email to