http://git-wip-us.apache.org/repos/asf/incubator-hivemall-site/blob/68241a08/userguide/sitemap.xml ---------------------------------------------------------------------- diff --git a/userguide/sitemap.xml b/userguide/sitemap.xml index 033bdd0..9717f81 100644 --- a/userguide/sitemap.xml +++ b/userguide/sitemap.xml @@ -51,6 +51,7 @@ <url> <loc>http://hivemall.incubator.apache.org/binaryclass/webspam.html</loc> <changefreq>weekly</changefreq> <priority>0.5</priority> </url> <url> <loc>http://hivemall.incubator.apache.org/binaryclass/webspam_dataset.html</loc> <changefreq>weekly</changefreq> <priority>0.5</priority> </url> <url> <loc>http://hivemall.incubator.apache.org/binaryclass/webspam_scw.html</loc> <changefreq>weekly</changefreq> <priority>0.5</priority> </url> +<url> <loc>http://hivemall.incubator.apache.org/binaryclass/titanic_rf.html</loc> <changefreq>weekly</changefreq> <priority>0.5</priority> </url> <url> <loc>http://hivemall.incubator.apache.org/multiclass/news20.html</loc> <changefreq>weekly</changefreq> <priority>0.5</priority> </url> <url> <loc>http://hivemall.incubator.apache.org/multiclass/news20_dataset.html</loc> <changefreq>weekly</changefreq> <priority>0.5</priority> </url> <url> <loc>http://hivemall.incubator.apache.org/multiclass/news20_one-vs-the-rest_dataset.html</loc> <changefreq>weekly</changefreq> <priority>0.5</priority> </url>
http://git-wip-us.apache.org/repos/asf/incubator-hivemall-site/blob/68241a08/userguide/tips/addbias.html ---------------------------------------------------------------------- diff --git a/userguide/tips/addbias.html b/userguide/tips/addbias.html index c953de8..9348488 100644 --- a/userguide/tips/addbias.html +++ b/userguide/tips/addbias.html @@ -999,6 +999,21 @@ </li> + <li class="chapter " data-level="5.6" data-path="../binaryclass/titanic_rf.html"> + + <a href="../binaryclass/titanic_rf.html"> + + + <b>5.6.</b> + + Kaggle Titanic Tutorial + + </a> + + + + </li> + @@ -1657,7 +1672,7 @@ Then, the predicted model considers bias existing in the dataset and the predicted hyperplane does not always cross the origin.</p> <p><strong>addBias()</strong> of Hivemall, adds a bias to a feature vector. To enable a bias clause, use addBias() for <strong>both</strong><em>(important!)</em> training and test data as follows. -The bias <em>b</em> is a feature of "0" ("-1" in before v0.3) by the default. See <a href="https://github.com/myui/hivemall/blob/master/src/main/hivemall/ftvec/AddBiasUDF.java" target="_blank">AddBiasUDF</a> for the detail.</p> +The bias <em>b</em> is a feature of "0" ("-1" in before v0.3) by the default. See <a href="addbias.html">AddBiasUDF</a> for the detail.</p> <p>Note that Bias is expressed as a feature that found in all training/testing examples.</p> <h1 id="adding-a-bias-clause-to-test-data">Adding a bias clause to test data</h1> <pre><code class="lang-sql"><span class="hljs-keyword">create</span> <span class="hljs-keyword">table</span> e2006tfidf_test_exploded <span class="hljs-keyword">as</span> @@ -1683,7 +1698,25 @@ from e2006tfidf_train_x3 ) t group by feature; -</code></pre><p><div id="page-footer"><hr><p><sub><font color="gray"> +</code></pre><p><div id="page-footer"><hr><!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<p><sub><font color="gray"> Apache Hivemall is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. </font></sub></p> </div></p> @@ -1720,7 +1753,7 @@ Apache Hivemall is an effort undergoing incubation at The Apache Software Founda <script> var gitbook = gitbook || []; gitbook.push(function() { - gitbook.page.hasChanged({"page":{"title":"Explicit addBias() for better prediction","level":"1.3.1","depth":2,"next":{"title":"Use rand_amplify() to better prediction results","level":"1.3.2","depth":2,"path":"tips/rand_amplify.md","ref":"tips/rand_amplify.md","articles":[]},"previous":{"title":"Tips for Effective Hivemall","level":"1.3","depth":1,"path":"tips/README.md","ref":"tips/README.md","articles":[{"title":"Explicit addBias() for better prediction","level":"1.3.1","depth":2,"path":"tips/addbias.md","ref":"tips/addbias.md","articles":[]},{"title":"Use rand_amplify() to better prediction results","level":"1.3.2","depth":2,"path":"tips/rand_amplify.md","ref":"tips/rand_amplify.md","articles":[]},{"title":"Real-time Prediction on RDBMS","level":"1.3.3","depth":2,"path":"tips/rt_prediction.md","ref":"tips/rt_prediction.md","articles":[]},{"title":"Ensemble learning for stable prediction","level":"1.3.4","depth":2,"path":"tips/ensemble_learning.md","ref":"tips/ensemble _learning.md","articles":[]},{"title":"Mixing models for a better prediction convergence (MIX server)","level":"1.3.5","depth":2,"path":"tips/mixserver.md","ref":"tips/mixserver.md","articles":[]},{"title":"Run Hivemall on Amazon Elastic MapReduce","level":"1.3.6","depth":2,"path":"tips/emr.md","ref":"tips/emr.md","articles":[]}]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator- hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"them e":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"tips/addbias.md","mtime":"2016-11-12T07:18:00.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2016-11-14T10:40:22.987Z"},"basePath":"..","book":{"language":""}}); + gitbook.page.hasChanged({"page":{"title":"Explicit addBias() for better prediction","level":"1.3.1","depth":2,"next":{"title":"Use rand_amplify() to better prediction results","level":"1.3.2","depth":2,"path":"tips/rand_amplify.md","ref":"tips/rand_amplify.md","articles":[]},"previous":{"title":"Tips for Effective Hivemall","level":"1.3","depth":1,"path":"tips/README.md","ref":"tips/README.md","articles":[{"title":"Explicit addBias() for better prediction","level":"1.3.1","depth":2,"path":"tips/addbias.md","ref":"tips/addbias.md","articles":[]},{"title":"Use rand_amplify() to better prediction results","level":"1.3.2","depth":2,"path":"tips/rand_amplify.md","ref":"tips/rand_amplify.md","articles":[]},{"title":"Real-time Prediction on RDBMS","level":"1.3.3","depth":2,"path":"tips/rt_prediction.md","ref":"tips/rt_prediction.md","articles":[]},{"title":"Ensemble learning for stable prediction","level":"1.3.4","depth":2,"path":"tips/ensemble_learning.md","ref":"tips/ensemble _learning.md","articles":[]},{"title":"Mixing models for a better prediction convergence (MIX server)","level":"1.3.5","depth":2,"path":"tips/mixserver.md","ref":"tips/mixserver.md","articles":[]},{"title":"Run Hivemall on Amazon Elastic MapReduce","level":"1.3.6","depth":2,"path":"tips/emr.md","ref":"tips/emr.md","articles":[]}]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator- hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"them e":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"tips/addbias.md","mtime":"2016-11-17T10:44:58.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2016-11-17T12:16:14.647Z"},"basePath":"..","book":{"language":""}}); }); </script> </div> http://git-wip-us.apache.org/repos/asf/incubator-hivemall-site/blob/68241a08/userguide/tips/emr.html ---------------------------------------------------------------------- diff --git a/userguide/tips/emr.html b/userguide/tips/emr.html index d428833..3357933 100644 --- a/userguide/tips/emr.html +++ b/userguide/tips/emr.html @@ -999,6 +999,21 @@ </li> + <li class="chapter " data-level="5.6" data-path="../binaryclass/titanic_rf.html"> + + <a href="../binaryclass/titanic_rf.html"> + + + <b>5.6.</b> + + Kaggle Titanic Tutorial + + </a> + + + + </li> + @@ -1649,6 +1664,22 @@ specific language governing permissions and limitations under the License. --> +<!-- toc --><div id="toc" class="toc"> + +<ul> +<li><a href="#prerequisite">Prerequisite</a></li> +<li><a href="#data-preparation">Data preparation</a><ul> +<li><a href="#adaptive-regularization-of-weight-vectors-arow">Adaptive Regularization of Weight Vectors (AROW)</a></li> +</ul> +</li> +<li><a href="#training">training</a></li> +<li><a href="#prediction">prediction</a></li> +<li><a href="#evaluation">evaluation</a></li> +<li><a href="#cleaning">Cleaning</a></li> +<li><a href="#tips">Tips</a></li> +</ul> + +</div><!-- tocstop --> <h2 id="prerequisite">Prerequisite</h2> <p>Learn how to use Hive with Elastic MapReduce (EMR).<br><a href="http://docs.aws.amazon.com/ElasticMapReduce/latest/DeveloperGuide/emr-hive.html" target="_blank">http://docs.aws.amazon.com/ElasticMapReduce/latest/DeveloperGuide/emr-hive.html</a></p> <p>Before launching an EMR job, </p> @@ -1809,7 +1840,25 @@ LOCATION <span class="hljs-string">'s3://${s3bucket}/emr/outputs/news20b_ar --bootstrap-name "install ganglia" \ --availability-zone ap-northeast-1a </code></pre><p>Using spot instance for core/task instance groups is the best way to save your money. -<div id="page-footer"><hr><p><sub><font color="gray"> +<div id="page-footer"><hr><!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<p><sub><font color="gray"> Apache Hivemall is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. </font></sub></p> </div></p> @@ -1846,7 +1895,7 @@ Apache Hivemall is an effort undergoing incubation at The Apache Software Founda <script> var gitbook = gitbook || []; gitbook.push(function() { - gitbook.page.hasChanged({"page":{"title":"Run Hivemall on Amazon Elastic MapReduce","level":"1.3.6","depth":2,"next":{"title":"General Hive/Hadoop tips","level":"1.4","depth":1,"path":"tips/general_tips.md","ref":"tips/general_tips.md","articles":[{"title":"Adding rowid for each row","level":"1.4.1","depth":2,"path":"tips/rowid.md","ref":"tips/rowid.md","articles":[]},{"title":"Hadoop tuning for Hivemall","level":"1.4.2","depth":2,"path":"tips/hadoop_tuning.md","ref":"tips/hadoop_tuning.md","articles":[]}]},"previous":{"title":"Mixing models for a better prediction convergence (MIX server)","level":"1.3.5","depth":2,"path":"tips/mixserver.md","ref":"tips/mixserver.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css"," pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"http s://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"tips/emr.md","mtime":"2016-11-14T09:51:55.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time": "2016-11-14T10:40:22.987Z"},"basePath":"..","book":{"language":""}}); + gitbook.page.hasChanged({"page":{"title":"Run Hivemall on Amazon Elastic MapReduce","level":"1.3.6","depth":2,"next":{"title":"General Hive/Hadoop tips","level":"1.4","depth":1,"path":"tips/general_tips.md","ref":"tips/general_tips.md","articles":[{"title":"Adding rowid for each row","level":"1.4.1","depth":2,"path":"tips/rowid.md","ref":"tips/rowid.md","articles":[]},{"title":"Hadoop tuning for Hivemall","level":"1.4.2","depth":2,"path":"tips/hadoop_tuning.md","ref":"tips/hadoop_tuning.md","articles":[]}]},"previous":{"title":"Mixing models for a better prediction convergence (MIX server)","level":"1.3.5","depth":2,"path":"tips/mixserver.md","ref":"tips/mixserver.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css"," pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"http s://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"tips/emr.md","mtime":"2016-11-17T12:10:52.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time": "2016-11-17T12:16:14.647Z"},"basePath":"..","book":{"language":""}}); }); </script> </div> http://git-wip-us.apache.org/repos/asf/incubator-hivemall-site/blob/68241a08/userguide/tips/ensemble_learning.html ---------------------------------------------------------------------- diff --git a/userguide/tips/ensemble_learning.html b/userguide/tips/ensemble_learning.html index 491233d..57ddf68 100644 --- a/userguide/tips/ensemble_learning.html +++ b/userguide/tips/ensemble_learning.html @@ -999,6 +999,21 @@ </li> + <li class="chapter " data-level="5.6" data-path="../binaryclass/titanic_rf.html"> + + <a href="../binaryclass/titanic_rf.html"> + + + <b>5.6.</b> + + Kaggle Titanic Tutorial + + </a> + + + + </li> + @@ -1860,7 +1875,25 @@ where actual == predicted; </tr> </tbody> </table> -<p><div id="page-footer"><hr><p><sub><font color="gray"> +<p><div id="page-footer"><hr><!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<p><sub><font color="gray"> Apache Hivemall is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. </font></sub></p> </div></p> @@ -1897,7 +1930,7 @@ Apache Hivemall is an effort undergoing incubation at The Apache Software Founda <script> var gitbook = gitbook || []; gitbook.push(function() { - gitbook.page.hasChanged({"page":{"title":"Ensemble learning for stable prediction","level":"1.3.4","depth":2,"next":{"title":"Mixing models for a better prediction convergence (MIX server)","level":"1.3.5","depth":2,"path":"tips/mixserver.md","ref":"tips/mixserver.md","articles":[]},"previous":{"title":"Real-time Prediction on RDBMS","level":"1.3.3","depth":2,"path":"tips/rt_prediction.md","ref":"tips/rt_prediction.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.c om/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel": true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"tips/ensemble_learning.md","mtime":"2016-11-12T07:18:00.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2016-11-14T10:40:22.987Z"},"basePath":"..","book":{"language":""}}); + gitbook.page.hasChanged({"page":{"title":"Ensemble learning for stable prediction","level":"1.3.4","depth":2,"next":{"title":"Mixing models for a better prediction convergence (MIX server)","level":"1.3.5","depth":2,"path":"tips/mixserver.md","ref":"tips/mixserver.md","articles":[]},"previous":{"title":"Real-time Prediction on RDBMS","level":"1.3.3","depth":2,"path":"tips/rt_prediction.md","ref":"tips/rt_prediction.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.c om/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel": true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"tips/ensemble_learning.md","mtime":"2016-11-16T08:39:12.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2016-11-17T12:16:14.647Z"},"basePath":"..","book":{"language":""}}); }); </script> </div> http://git-wip-us.apache.org/repos/asf/incubator-hivemall-site/blob/68241a08/userguide/tips/general_tips.html ---------------------------------------------------------------------- diff --git a/userguide/tips/general_tips.html b/userguide/tips/general_tips.html index 031bb41..354c524 100644 --- a/userguide/tips/general_tips.html +++ b/userguide/tips/general_tips.html @@ -999,6 +999,21 @@ </li> + <li class="chapter " data-level="5.6" data-path="../binaryclass/titanic_rf.html"> + + <a href="../binaryclass/titanic_rf.html"> + + + <b>5.6.</b> + + Kaggle Titanic Tutorial + + </a> + + + + </li> + @@ -1649,7 +1664,25 @@ specific language governing permissions and limitations under the License. --> -<p><div id="page-footer"><hr><p><sub><font color="gray"> +<p><div id="page-footer"><hr><!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<p><sub><font color="gray"> Apache Hivemall is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. </font></sub></p> </div></p> @@ -1686,7 +1719,7 @@ Apache Hivemall is an effort undergoing incubation at The Apache Software Founda <script> var gitbook = gitbook || []; gitbook.push(function() { - gitbook.page.hasChanged({"page":{"title":"General Hive/Hadoop tips","level":"1.4","depth":1,"next":{"title":"Adding rowid for each row","level":"1.4.1","depth":2,"path":"tips/rowid.md","ref":"tips/rowid.md","articles":[]},"previous":{"title":"Run Hivemall on Amazon Elastic MapReduce","level":"1.3.6","depth":2,"path":"tips/emr.md","ref":"tips/emr.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf" :{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggl e-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"tips/general_tips.md","mtime":"2016-11-12T07:18:00.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2016-11-14T10:40:22.987Z"},"basePath":"..","book":{"language":""}}); + gitbook.page.hasChanged({"page":{"title":"General Hive/Hadoop tips","level":"1.4","depth":1,"next":{"title":"Adding rowid for each row","level":"1.4.1","depth":2,"path":"tips/rowid.md","ref":"tips/rowid.md","articles":[]},"previous":{"title":"Run Hivemall on Amazon Elastic MapReduce","level":"1.3.6","depth":2,"path":"tips/emr.md","ref":"tips/emr.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf" :{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggl e-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"tips/general_tips.md","mtime":"2016-11-16T08:39:12.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2016-11-17T12:16:14.647Z"},"basePath":"..","book":{"language":""}}); }); </script> </div> http://git-wip-us.apache.org/repos/asf/incubator-hivemall-site/blob/68241a08/userguide/tips/hadoop_tuning.html ---------------------------------------------------------------------- diff --git a/userguide/tips/hadoop_tuning.html b/userguide/tips/hadoop_tuning.html index 986d0ae..0e4675d 100644 --- a/userguide/tips/hadoop_tuning.html +++ b/userguide/tips/hadoop_tuning.html @@ -999,6 +999,21 @@ </li> + <li class="chapter " data-level="5.6" data-path="../binaryclass/titanic_rf.html"> + + <a href="../binaryclass/titanic_rf.html"> + + + <b>5.6.</b> + + Kaggle Titanic Tutorial + + </a> + + + + </li> + @@ -1649,6 +1664,17 @@ specific language governing permissions and limitations under the License. --> +<!-- toc --><div id="toc" class="toc"> + +<ul> +<li><a href="#prerequisites">Prerequisites</a></li> +<li><a href="#mapper-side-configuration">Mapper-side configuration</a></li> +<li><a href="#reducer-side-configuration">Reducer-side configuration</a></li> +<li><a href="#formula-to-estimate-consumed-memory-in-hivemall">Formula to estimate consumed memory in Hivemall</a></li> +<li><a href="#execution-engine-of-hive">Execution Engine of Hive</a></li> +</ul> + +</div><!-- tocstop --> <h1 id="prerequisites">Prerequisites</h1> <p>Please refer the following guides for Hadoop tuning:</p> <ul> @@ -1707,7 +1733,25 @@ mapred.reduce.shuffle.input.buffer.percent=0.6 (MR v1) <pre><code class="lang-sql"><span class="hljs-keyword">set</span> mapreduce.framework.<span class="hljs-keyword">name</span>=yarn; <span class="hljs-keyword">set</span> hive.execution.<span class="hljs-keyword">engine</span>=mr; </code></pre> -<p><div id="page-footer"><hr><p><sub><font color="gray"> +<p><div id="page-footer"><hr><!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<p><sub><font color="gray"> Apache Hivemall is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. </font></sub></p> </div></p> @@ -1744,7 +1788,7 @@ Apache Hivemall is an effort undergoing incubation at The Apache Software Founda <script> var gitbook = gitbook || []; gitbook.push(function() { - gitbook.page.hasChanged({"page":{"title":"Hadoop tuning for Hivemall","level":"1.4.2","depth":2,"next":{"title":"Troubleshooting","level":"1.5","depth":1,"path":"troubleshooting/README.md","ref":"troubleshooting/README.md","articles":[{"title":"OutOfMemoryError in training","level":"1.5.1","depth":2,"path":"troubleshooting/oom.md","ref":"troubleshooting/oom.md","articles":[]},{"title":"SemanticException Generate Map Join Task Error: Cannot serialize object","level":"1.5.2","depth":2,"path":"troubleshooting/mapjoin_task_error.md","ref":"troubleshooting/mapjoin_task_error.md","articles":[]},{"title":"Asterisk argument for UDTF does not work","level":"1.5.3","depth":2,"path":"troubleshooting/asterisk.md","ref":"troubleshooting/asterisk.md","articles":[]},{"title":"The number of mappers is less than input splits in Hadoop 2.x","level":"1.5.4","depth":2,"path":"troubleshooting/num_mappers.md","ref":"troubleshooting/num_mappers.md","articles":[]},{"title":"Map-side Join causes ClassCastException on Tez","level":"1.5.5","depth":2,"path":"troubleshooting/mapjoin_classcastex.md","ref":"troubleshooting/mapjoin_classcastex.md","articles":[]}]},"previous":{"title":"Adding rowid for each row","level":"1.4.1","depth":2,"path":"tips/rowid.md","ref":"tips/rowid.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"P DF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers" :true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"tips/hadoop_tuning.md","mtime":"2016-11-12T07:18:00.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2016-11-14T10:40:22.987Z"},"basePath":"..","book":{"language":""}}); + gitbook.page.hasChanged({"page":{"title":"Hadoop tuning for Hivemall","level":"1.4.2","depth":2,"next":{"title":"Troubleshooting","level":"1.5","depth":1,"path":"troubleshooting/README.md","ref":"troubleshooting/README.md","articles":[{"title":"OutOfMemoryError in training","level":"1.5.1","depth":2,"path":"troubleshooting/oom.md","ref":"troubleshooting/oom.md","articles":[]},{"title":"SemanticException Generate Map Join Task Error: Cannot serialize object","level":"1.5.2","depth":2,"path":"troubleshooting/mapjoin_task_error.md","ref":"troubleshooting/mapjoin_task_error.md","articles":[]},{"title":"Asterisk argument for UDTF does not work","level":"1.5.3","depth":2,"path":"troubleshooting/asterisk.md","ref":"troubleshooting/asterisk.md","articles":[]},{"title":"The number of mappers is less than input splits in Hadoop 2.x","level":"1.5.4","depth":2,"path":"troubleshooting/num_mappers.md","ref":"troubleshooting/num_mappers.md","articles":[]},{"title":"Map-side Join causes ClassCastException on Tez","level":"1.5.5","depth":2,"path":"troubleshooting/mapjoin_classcastex.md","ref":"troubleshooting/mapjoin_classcastex.md","articles":[]}]},"previous":{"title":"Adding rowid for each row","level":"1.4.1","depth":2,"path":"tips/rowid.md","ref":"tips/rowid.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"P DF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers" :true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"tips/hadoop_tuning.md","mtime":"2016-11-17T12:14:00.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2016-11-17T12:16:14.647Z"},"basePath":"..","book":{"language":""}}); }); </script> </div> http://git-wip-us.apache.org/repos/asf/incubator-hivemall-site/blob/68241a08/userguide/tips/index.html ---------------------------------------------------------------------- diff --git a/userguide/tips/index.html b/userguide/tips/index.html index d89990d..6eaf370 100644 --- a/userguide/tips/index.html +++ b/userguide/tips/index.html @@ -999,6 +999,21 @@ </li> + <li class="chapter " data-level="5.6" data-path="../binaryclass/titanic_rf.html"> + + <a href="../binaryclass/titanic_rf.html"> + + + <b>5.6.</b> + + Kaggle Titanic Tutorial + + </a> + + + + </li> + @@ -1649,7 +1664,25 @@ specific language governing permissions and limitations under the License. --> -<p><div id="page-footer"><hr><p><sub><font color="gray"> +<p><div id="page-footer"><hr><!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<p><sub><font color="gray"> Apache Hivemall is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. </font></sub></p> </div></p> @@ -1686,7 +1719,7 @@ Apache Hivemall is an effort undergoing incubation at The Apache Software Founda <script> var gitbook = gitbook || []; gitbook.push(function() { - gitbook.page.hasChanged({"page":{"title":"Tips for Effective Hivemall","level":"1.3","depth":1,"next":{"title":"Explicit addBias() for better prediction","level":"1.3.1","depth":2,"path":"tips/addbias.md","ref":"tips/addbias.md","articles":[]},"previous":{"title":"Input Format","level":"1.2.3","depth":2,"path":"getting_started/input-format.md","ref":"getting_started/input-format.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"spli tter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,h2,h 3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"tips/README.md","mtime":"2016-11-12T07:18:00.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2016-11-14T10:40:22.987Z"},"basePath":"..","book":{"language":""}}); + gitbook.page.hasChanged({"page":{"title":"Tips for Effective Hivemall","level":"1.3","depth":1,"next":{"title":"Explicit addBias() for better prediction","level":"1.3.1","depth":2,"path":"tips/addbias.md","ref":"tips/addbias.md","articles":[]},"previous":{"title":"Input Format","level":"1.2.3","depth":2,"path":"getting_started/input-format.md","ref":"getting_started/input-format.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"spli tter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,h2,h 3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"tips/README.md","mtime":"2016-11-16T08:39:12.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2016-11-17T12:16:14.647Z"},"basePath":"..","book":{"language":""}}); }); </script> </div> http://git-wip-us.apache.org/repos/asf/incubator-hivemall-site/blob/68241a08/userguide/tips/mixserver.html ---------------------------------------------------------------------- diff --git a/userguide/tips/mixserver.html b/userguide/tips/mixserver.html index c267485..69f4a94 100644 --- a/userguide/tips/mixserver.html +++ b/userguide/tips/mixserver.html @@ -999,6 +999,21 @@ </li> + <li class="chapter " data-level="5.6" data-path="../binaryclass/titanic_rf.html"> + + <a href="../binaryclass/titanic_rf.html"> + + + <b>5.6.</b> + + Kaggle Titanic Tutorial + + </a> + + + + </li> + @@ -1649,15 +1664,24 @@ specific language governing permissions and limitations under the License. --> -<p>In this page, we will explain how to use model mixing on Hivemall. The model mixing is useful for a better prediction performance and faster convergence in training classifiers. </p> -<!-- -You can find a brief explanation of the internal design of MIX protocol in [this slide](http://www.slideshare.net/myui/hivemall-mix-internal). ---> +<p>In this page, we will explain how to use model mixing on Hivemall. The model mixing is useful for a better prediction performance and faster convergence in training classifiers. +You can find a brief explanation of the internal design of MIX protocol in <a href="http://www.slideshare.net/myui/hivemall-mix-internal" target="_blank">this slide</a>.</p> +<!-- toc --><div id="toc" class="toc"> + +<ul> +<li><a href="#prerequisite">Prerequisite</a></li> +<li><a href="#running-mix-server">Running Mix Server</a></li> +<li><a href="#using-mix-protocol-through-hivemall">Using Mix Protocol through Hivemall</a></li> +<li><a href="#the-effect-of-model-mixing">The effect of model mixing</a></li> +</ul> + +</div><!-- tocstop --> <h1 id="prerequisite">Prerequisite</h1> <ul> -<li>Hivemall v0.3 or later</li> +<li><p>Hivemall v0.3 or later</p> +<p> We recommend to use Mixing in a cluster with fast networking. The current standard GbE is enough though.</p> +</li> </ul> -<p>We recommend to use Mixing in a cluster with fast networking. The current standard GbE is enough though.</p> <h1 id="running-mix-server">Running Mix Server</h1> <p>First, put the following files on server(s) that are accessible from Hadoop worker nodes:</p> <ul> @@ -1673,9 +1697,9 @@ The default port used by Mix server is 11212 and the port is configurable throug <p>See <a href="https://github.com/myui/hivemall/blob/master/mixserv/src/main/java/hivemall/mix/server/MixServer.java#L90" target="_blank">MixServer.java</a> to get detail of the Mix server options.</p> <p>We recommended to use multiple MIX servers to get better MIX throughput (3-5 or so would be enough for normal cluster size). The MIX protocol of Hivemall is <em>horizontally scalable</em> by adding MIX server nodes.</p> <h1 id="using-mix-protocol-through-hivemall">Using Mix Protocol through Hivemall</h1> -<p><a href="https://github.com/myui/hivemall/wiki/Installation" target="_blank">Install Hivemall</a> on Hive.</p> +<p><a href="../getting_started/installation.html">Install Hivemall</a> on Hive.</p> <p><em>Make sure that <a href="https://github.com/myui/hivemall/raw/master/target/hivemall-with-dependencies.jar" target="_blank">hivemall-with-dependencies.jar</a> is used for installation. The jar contains minimum requirement jars (netty,jsr305) for running Hivemall on Hive.</em></p> -<p>Now, we explain that how to use mixing in <a href="https://github.com/myui/hivemall/wiki/KDD2010a-classification" target="_blank">an example using KDD2010a dataset</a>.</p> +<p>Now, we explain that how to use mixing in <a href="../binaryclass/kdd2010a_dataset.html">an example using KDD2010a dataset</a>.</p> <p>Enabling the mixing on Hivemall is simple as follows:</p> <pre><code class="lang-sql"><span class="hljs-keyword">use</span> kdd2010; @@ -1694,8 +1718,26 @@ The default port used by Mix server is 11212 and the port is configurable throug <p>All you have to do is just adding "<em>-mix</em>" training option as seen in the above query.</p> <h1 id="the-effect-of-model-mixing">The effect of model mixing</h1> <p>In my experience, the MIX improved the prediction accuracy of the above KDD2010a PA1 training on a 32 nodes cluster from 0.844835019263103 (w/o mix) to 0.8678096499719774 (w/ mix).</p> -<p>The overhead of using the MIX protocol is <em>almost negligible</em> because the MIX communication is efficiently handled using asynchronous non-blocking I/O. Furthermore, the training time could be improved on certain settings because of the faster convergence due to mixing. -<div id="page-footer"><hr><p><sub><font color="gray"> +<p>The overhead of using the MIX protocol is <em>almost negligible</em> because the MIX communication is efficiently handled using asynchronous non-blocking I/O. Furthermore, the training time could be improved on certain settings because of the faster convergence due to mixing. +<div id="page-footer"><hr><!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<p><sub><font color="gray"> Apache Hivemall is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. </font></sub></p> </div></p> @@ -1732,7 +1774,7 @@ Apache Hivemall is an effort undergoing incubation at The Apache Software Founda <script> var gitbook = gitbook || []; gitbook.push(function() { - gitbook.page.hasChanged({"page":{"title":"Mixing models for a better prediction convergence (MIX server)","level":"1.3.5","depth":2,"next":{"title":"Run Hivemall on Amazon Elastic MapReduce","level":"1.3.6","depth":2,"path":"tips/emr.md","ref":"tips/emr.md","articles":[]},"previous":{"title":"Ensemble learning for stable prediction","level":"1.3.4","depth":2,"path":"tips/ensemble_learning.md","ref":"tips/ensemble_learning.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://g ithub.com/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"show Level":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"tips/mixserver.md","mtime":"2016-11-12T07:18:00.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2016-11-14T10:40:22.987Z"},"basePath":"..","book":{"language":""}}); + gitbook.page.hasChanged({"page":{"title":"Mixing models for a better prediction convergence (MIX server)","level":"1.3.5","depth":2,"next":{"title":"Run Hivemall on Amazon Elastic MapReduce","level":"1.3.6","depth":2,"path":"tips/emr.md","ref":"tips/emr.md","articles":[]},"previous":{"title":"Ensemble learning for stable prediction","level":"1.3.4","depth":2,"path":"tips/ensemble_learning.md","ref":"tips/ensemble_learning.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://g ithub.com/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"show Level":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"tips/mixserver.md","mtime":"2016-11-17T12:10:19.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2016-11-17T12:16:14.647Z"},"basePath":"..","book":{"language":""}}); }); </script> </div> http://git-wip-us.apache.org/repos/asf/incubator-hivemall-site/blob/68241a08/userguide/tips/rand_amplify.html ---------------------------------------------------------------------- diff --git a/userguide/tips/rand_amplify.html b/userguide/tips/rand_amplify.html index 5d550f5..8d3690b 100644 --- a/userguide/tips/rand_amplify.html +++ b/userguide/tips/rand_amplify.html @@ -999,6 +999,21 @@ </li> + <li class="chapter " data-level="5.6" data-path="../binaryclass/titanic_rf.html"> + + <a href="../binaryclass/titanic_rf.html"> + + + <b>5.6.</b> + + Kaggle Titanic Tutorial + + </a> + + + + </li> + @@ -1651,13 +1666,21 @@ --> <p>This article explains <em>amplify</em> technique that is useful for improving prediction score.</p> <p>Iterations are mandatory in machine learning (e.g., in <a href="http://en.wikipedia.org/wiki/Stochastic_gradient_descent" target="_blank">stochastic gradient descent</a>) to get good prediction models. However, MapReduce is known to be not suited for iterative algorithms because IN/OUT of each MapReduce job is through HDFS.</p> -<p>In this example, we show how Hivemall deals with this problem. We use <a href="https://github.com/myui/hivemall/wiki/KDDCup-2012-track-2-CTR-prediction-dataset" target="_blank">KDD Cup 2012, Track 2 Task</a> as an example.</p> -<p><strong>WARNING</strong>: rand_amplify() is supported in v0.2-beta1 and later.</p> +<p>In this example, we show how Hivemall deals with this problem. We use <a href="../regression/kddcup12tr2_dataset.html">KDD Cup 2012, Track 2 Task</a> as an example.</p> +<!-- toc --><div id="toc" class="toc"> + +<ul> +<li><a href="#amplify-training-examples-in-map-phase-and-shuffle-them-in-reduce-phase">Amplify training examples in Map phase and shuffle them in Reduce phase</a></li> +<li><a href="#amplify-and-shuffle-training-examples-in-each-map-task">Amplify and shuffle training examples in each Map task</a></li> +<li><a href="#conclusion">Conclusion</a></li> +</ul> + +</div><!-- tocstop --> <hr> <h1 id="amplify-training-examples-in-map-phase-and-shuffle-them-in-reduce-phase">Amplify training examples in Map phase and shuffle them in Reduce phase</h1> <p>Hivemall provides the <strong>amplify</strong> UDTF to enumerate iteration effects in machine learning without several MapReduce steps. </p> <p>The amplify function returns multiple rows for each row. -The first argument ${xtimes} is the multiplication factor.<br>In the following examples, the multiplication factor is set to 3.</p> +The first argument <code>${xtimes}</code> is the multiplication factor.<br>In the following examples, the multiplication factor is set to 3.</p> <pre><code class="lang-sql"><span class="hljs-keyword">set</span> hivevar:xtimes=<span class="hljs-number">3</span>; <span class="hljs-keyword">create</span> <span class="hljs-keyword">or</span> <span class="hljs-keyword">replace</span> <span class="hljs-keyword">view</span> training_x3 @@ -1690,8 +1713,8 @@ So, we recommend users to use an amplified view for training as follows:</p> </code></pre> <p>The above query is executed by 2 MapReduce jobs as shown below: <img src="../resources/images/amplify.png" alt="amplifier"></p> -<p>Using <em>trainning_x3</em> instead of the plain training table results in higher and better AUC (0.746214) in <a href="https://github.com/myui/hivemall/wiki/KDDCup-2012-track-2-CTR-prediction-(regression\" target="_blank">this</a>) example.</p> -<p>A problem in amplify() is that the shuffle (copy) and merge phase of the stage 1 could become a bottleneck. +<p>Using <em>trainning_x3</em> instead of the plain training table results in higher and better AUC (0.746214) in <a href="../regression/kddcup12tr2_lr_amplify.html#conclusion">this example</a>.</p> +<p>A problem in <code>amplify()</code> is that the shuffle (copy) and merge phase of the stage 1 could become a bottleneck. When the training table is so large that involves 100 Map tasks, the merge operator needs to merge at least 100 files by (external) merge sort! </p> <p>Note that the actual bottleneck is not M/R iterations but shuffling training instance. Iteration without shuffling (as in <a href="http://spark.incubator.apache.org/examples.html" target="_blank">the Spark example</a>) causes very slow convergence and results in requiring more iterations. Shuffling cannot be avoided even in iterative MapReduce variants.</p> <p><img src="../resources/images/amplify_elapsed.png" alt="amplify_elapsed"></p> @@ -1713,7 +1736,7 @@ The rand_amplify UDTF outputs rows in a random order when the local buffer speci <p><img src="../resources/images/randamplify.png" alt="randamplify"></p> <p>The map-local multiplication and shuffling has no bottleneck in the merge phase and the query is efficiently executed within a single MapReduce job.</p> <p><img src="../resources/images/randamplify_elapsed.png" alt="randamplify_elapsed"></p> -<p>Using <em>rand_amplify</em> results in a better AUC (0.743392) in <a href="https://github.com/myui/hivemall/wiki/KDDCup-2012-track-2-CTR-prediction-(regression\" target="_blank">this</a>) example.</p> +<p>Using <em>rand_amplify</em> results in a better AUC (0.743392) in <a href="../regression/kddcup12tr2_lr_amplify.html#conclusion">this example</a>.</p> <hr> <h1 id="conclusion">Conclusion</h1> <p>We recommend users to use <em>amplify()</em> for small training inputs and to use <em>rand_amplify()</em> for large training inputs to get a better accuracy in a reasonable training time.</p> @@ -1743,7 +1766,25 @@ The rand_amplify UDTF outputs rows in a random order when the local buffer speci </tr> </tbody> </table> -<p><div id="page-footer"><hr><p><sub><font color="gray"> +<p><div id="page-footer"><hr><!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<p><sub><font color="gray"> Apache Hivemall is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. </font></sub></p> </div></p> @@ -1780,7 +1821,7 @@ Apache Hivemall is an effort undergoing incubation at The Apache Software Founda <script> var gitbook = gitbook || []; gitbook.push(function() { - gitbook.page.hasChanged({"page":{"title":"Use rand_amplify() to better prediction results","level":"1.3.2","depth":2,"next":{"title":"Real-time Prediction on RDBMS","level":"1.3.3","depth":2,"path":"tips/rt_prediction.md","ref":"tips/rt_prediction.md","articles":[]},"previous":{"title":"Explicit addBias() for better prediction","level":"1.3.1","depth":2,"path":"tips/addbias.md","ref":"tips/addbias.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubato r-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{ "selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"tips/rand_amplify.md","mtime":"2016-11-14T09:51:23.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2016-11-14T10:40:22.987Z"},"basePath":"..","book":{"language":""}}); + gitbook.page.hasChanged({"page":{"title":"Use rand_amplify() to better prediction results","level":"1.3.2","depth":2,"next":{"title":"Real-time Prediction on RDBMS","level":"1.3.3","depth":2,"path":"tips/rt_prediction.md","ref":"tips/rt_prediction.md","articles":[]},"previous":{"title":"Explicit addBias() for better prediction","level":"1.3.1","depth":2,"path":"tips/addbias.md","ref":"tips/addbias.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubato r-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{ "selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"tips/rand_amplify.md","mtime":"2016-11-17T11:44:53.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2016-11-17T12:16:14.647Z"},"basePath":"..","book":{"language":""}}); }); </script> </div>
