http://git-wip-us.apache.org/repos/asf/incubator-hivemall-site/blob/ba518dab/userguide/clustering/plsa.html ---------------------------------------------------------------------- diff --git a/userguide/clustering/plsa.html b/userguide/clustering/plsa.html index 2ab3364..51555fb 100644 --- a/userguide/clustering/plsa.html +++ b/userguide/clustering/plsa.html @@ -598,14 +598,30 @@ </li> - <li class="chapter " data-level="3.5" data-path="../ft_engineering/tfidf.html"> + <li class="chapter " data-level="3.5" data-path="../ft_engineering/pairing.html"> - <a href="../ft_engineering/tfidf.html"> + <a href="../ft_engineering/pairing.html"> <b>3.5.</b> - TF-IDF Calculation + FEATURE PAIRING + + </a> + + + + <ul class="articles"> + + + <li class="chapter " data-level="3.5.1" data-path="../ft_engineering/polynomial.html"> + + <a href="../ft_engineering/polynomial.html"> + + + <b>3.5.1.</b> + + Polynomial Features </a> @@ -613,6 +629,11 @@ </li> + + </ul> + + </li> + <li class="chapter " data-level="3.6" data-path="../ft_engineering/ft_trans.html"> <a href="../ft_engineering/ft_trans.html"> @@ -664,6 +685,21 @@ </li> + <li class="chapter " data-level="3.7" data-path="../ft_engineering/tfidf.html"> + + <a href="../ft_engineering/tfidf.html"> + + + <b>3.7.</b> + + TF-IDF Calculation + + </a> + + + + </li> + @@ -761,7 +797,7 @@ - <li class="header">Part V - Prediction</li> + <li class="header">Part V - Supervised Learning</li> @@ -780,27 +816,19 @@ </li> - <li class="chapter " data-level="5.2" data-path="../regression/general.html"> - - <a href="../regression/general.html"> - - - <b>5.2.</b> - - Regression - - </a> - - - </li> - <li class="chapter " data-level="5.3" data-path="../binaryclass/general.html"> + + <li class="header">Part VI - Binary classification</li> + + + + <li class="chapter " data-level="6.1" data-path="../binaryclass/general.html"> <a href="../binaryclass/general.html"> - <b>5.3.</b> + <b>6.1.</b> Binary Classification @@ -810,21 +838,14 @@ </li> - - - - <li class="header">Part VI - Binary classification tutorials</li> - - - - <li class="chapter " data-level="6.1" data-path="../binaryclass/a9a.html"> + <li class="chapter " data-level="6.2" data-path="../binaryclass/a9a.html"> <a href="../binaryclass/a9a.html"> - <b>6.1.</b> + <b>6.2.</b> - a9a + a9a tutorial </a> @@ -833,12 +854,12 @@ <ul class="articles"> - <li class="chapter " data-level="6.1.1" data-path="../binaryclass/a9a_dataset.html"> + <li class="chapter " data-level="6.2.1" data-path="../binaryclass/a9a_dataset.html"> <a href="../binaryclass/a9a_dataset.html"> - <b>6.1.1.</b> + <b>6.2.1.</b> Data preparation @@ -848,12 +869,12 @@ </li> - <li class="chapter " data-level="6.1.2" data-path="../binaryclass/a9a_lr.html"> + <li class="chapter " data-level="6.2.2" data-path="../binaryclass/a9a_lr.html"> <a href="../binaryclass/a9a_lr.html"> - <b>6.1.2.</b> + <b>6.2.2.</b> Logistic Regression @@ -863,12 +884,12 @@ </li> - <li class="chapter " data-level="6.1.3" data-path="../binaryclass/a9a_minibatch.html"> + <li class="chapter " data-level="6.2.3" data-path="../binaryclass/a9a_minibatch.html"> <a href="../binaryclass/a9a_minibatch.html"> - <b>6.1.3.</b> + <b>6.2.3.</b> Mini-batch Gradient Descent @@ -883,14 +904,14 @@ </li> - <li class="chapter " data-level="6.2" data-path="../binaryclass/news20.html"> + <li class="chapter " data-level="6.3" data-path="../binaryclass/news20.html"> <a href="../binaryclass/news20.html"> - <b>6.2.</b> + <b>6.3.</b> - News20 + News20 tutorial </a> @@ -899,12 +920,12 @@ <ul class="articles"> - <li class="chapter " data-level="6.2.1" data-path="../binaryclass/news20_dataset.html"> + <li class="chapter " data-level="6.3.1" data-path="../binaryclass/news20_dataset.html"> <a href="../binaryclass/news20_dataset.html"> - <b>6.2.1.</b> + <b>6.3.1.</b> Data preparation @@ -914,12 +935,12 @@ </li> - <li class="chapter " data-level="6.2.2" data-path="../binaryclass/news20_pa.html"> + <li class="chapter " data-level="6.3.2" data-path="../binaryclass/news20_pa.html"> <a href="../binaryclass/news20_pa.html"> - <b>6.2.2.</b> + <b>6.3.2.</b> Perceptron, Passive Aggressive @@ -929,12 +950,12 @@ </li> - <li class="chapter " data-level="6.2.3" data-path="../binaryclass/news20_scw.html"> + <li class="chapter " data-level="6.3.3" data-path="../binaryclass/news20_scw.html"> <a href="../binaryclass/news20_scw.html"> - <b>6.2.3.</b> + <b>6.3.3.</b> CW, AROW, SCW @@ -944,12 +965,12 @@ </li> - <li class="chapter " data-level="6.2.4" data-path="../binaryclass/news20_adagrad.html"> + <li class="chapter " data-level="6.3.4" data-path="../binaryclass/news20_adagrad.html"> <a href="../binaryclass/news20_adagrad.html"> - <b>6.2.4.</b> + <b>6.3.4.</b> AdaGradRDA, AdaGrad, AdaDelta @@ -964,14 +985,14 @@ </li> - <li class="chapter " data-level="6.3" data-path="../binaryclass/kdd2010a.html"> + <li class="chapter " data-level="6.4" data-path="../binaryclass/kdd2010a.html"> <a href="../binaryclass/kdd2010a.html"> - <b>6.3.</b> + <b>6.4.</b> - KDD2010a + KDD2010a tutorial </a> @@ -980,12 +1001,12 @@ <ul class="articles"> - <li class="chapter " data-level="6.3.1" data-path="../binaryclass/kdd2010a_dataset.html"> + <li class="chapter " data-level="6.4.1" data-path="../binaryclass/kdd2010a_dataset.html"> <a href="../binaryclass/kdd2010a_dataset.html"> - <b>6.3.1.</b> + <b>6.4.1.</b> Data preparation @@ -995,12 +1016,12 @@ </li> - <li class="chapter " data-level="6.3.2" data-path="../binaryclass/kdd2010a_scw.html"> + <li class="chapter " data-level="6.4.2" data-path="../binaryclass/kdd2010a_scw.html"> <a href="../binaryclass/kdd2010a_scw.html"> - <b>6.3.2.</b> + <b>6.4.2.</b> PA, CW, AROW, SCW @@ -1015,14 +1036,14 @@ </li> - <li class="chapter " data-level="6.4" data-path="../binaryclass/kdd2010b.html"> + <li class="chapter " data-level="6.5" data-path="../binaryclass/kdd2010b.html"> <a href="../binaryclass/kdd2010b.html"> - <b>6.4.</b> + <b>6.5.</b> - KDD2010b + KDD2010b tutorial </a> @@ -1031,12 +1052,12 @@ <ul class="articles"> - <li class="chapter " data-level="6.4.1" data-path="../binaryclass/kdd2010b_dataset.html"> + <li class="chapter " data-level="6.5.1" data-path="../binaryclass/kdd2010b_dataset.html"> <a href="../binaryclass/kdd2010b_dataset.html"> - <b>6.4.1.</b> + <b>6.5.1.</b> Data preparation @@ -1046,12 +1067,12 @@ </li> - <li class="chapter " data-level="6.4.2" data-path="../binaryclass/kdd2010b_arow.html"> + <li class="chapter " data-level="6.5.2" data-path="../binaryclass/kdd2010b_arow.html"> <a href="../binaryclass/kdd2010b_arow.html"> - <b>6.4.2.</b> + <b>6.5.2.</b> AROW @@ -1066,14 +1087,14 @@ </li> - <li class="chapter " data-level="6.5" data-path="../binaryclass/webspam.html"> + <li class="chapter " data-level="6.6" data-path="../binaryclass/webspam.html"> <a href="../binaryclass/webspam.html"> - <b>6.5.</b> + <b>6.6.</b> - Webspam + Webspam tutorial </a> @@ -1082,12 +1103,12 @@ <ul class="articles"> - <li class="chapter " data-level="6.5.1" data-path="../binaryclass/webspam_dataset.html"> + <li class="chapter " data-level="6.6.1" data-path="../binaryclass/webspam_dataset.html"> <a href="../binaryclass/webspam_dataset.html"> - <b>6.5.1.</b> + <b>6.6.1.</b> Data pareparation @@ -1097,12 +1118,12 @@ </li> - <li class="chapter " data-level="6.5.2" data-path="../binaryclass/webspam_scw.html"> + <li class="chapter " data-level="6.6.2" data-path="../binaryclass/webspam_scw.html"> <a href="../binaryclass/webspam_scw.html"> - <b>6.5.2.</b> + <b>6.6.2.</b> PA1, AROW, SCW @@ -1117,14 +1138,14 @@ </li> - <li class="chapter " data-level="6.6" data-path="../binaryclass/titanic_rf.html"> + <li class="chapter " data-level="6.7" data-path="../binaryclass/titanic_rf.html"> <a href="../binaryclass/titanic_rf.html"> - <b>6.6.</b> + <b>6.7.</b> - Kaggle Titanic + Kaggle Titanic tutorial </a> @@ -1135,7 +1156,7 @@ - <li class="header">Part VII - Multiclass classification tutorials</li> + <li class="header">Part VII - Multiclass classification</li> @@ -1146,7 +1167,7 @@ <b>7.1.</b> - News20 Multiclass + News20 Multiclass tutorial </a> @@ -1257,7 +1278,7 @@ <b>7.2.</b> - Iris + Iris tutorial </a> @@ -1319,18 +1340,33 @@ - <li class="header">Part VIII - Regression tutorials</li> + <li class="header">Part VIII - Regression</li> - <li class="chapter " data-level="8.1" data-path="../regression/e2006.html"> + <li class="chapter " data-level="8.1" data-path="../regression/general.html"> - <a href="../regression/e2006.html"> + <a href="../regression/general.html"> <b>8.1.</b> - E2006-tfidf regression + Regression + + </a> + + + + </li> + + <li class="chapter " data-level="8.2" data-path="../regression/e2006.html"> + + <a href="../regression/e2006.html"> + + + <b>8.2.</b> + + E2006-tfidf regression tutorial </a> @@ -1339,12 +1375,12 @@ <ul class="articles"> - <li class="chapter " data-level="8.1.1" data-path="../regression/e2006_dataset.html"> + <li class="chapter " data-level="8.2.1" data-path="../regression/e2006_dataset.html"> <a href="../regression/e2006_dataset.html"> - <b>8.1.1.</b> + <b>8.2.1.</b> Data preparation @@ -1354,12 +1390,12 @@ </li> - <li class="chapter " data-level="8.1.2" data-path="../regression/e2006_arow.html"> + <li class="chapter " data-level="8.2.2" data-path="../regression/e2006_arow.html"> <a href="../regression/e2006_arow.html"> - <b>8.1.2.</b> + <b>8.2.2.</b> Passive Aggressive, AROW @@ -1374,14 +1410,14 @@ </li> - <li class="chapter " data-level="8.2" data-path="../regression/kddcup12tr2.html"> + <li class="chapter " data-level="8.3" data-path="../regression/kddcup12tr2.html"> <a href="../regression/kddcup12tr2.html"> - <b>8.2.</b> + <b>8.3.</b> - KDDCup 2012 track 2 CTR prediction + KDDCup 2012 track 2 CTR prediction tutorial </a> @@ -1390,12 +1426,12 @@ <ul class="articles"> - <li class="chapter " data-level="8.2.1" data-path="../regression/kddcup12tr2_dataset.html"> + <li class="chapter " data-level="8.3.1" data-path="../regression/kddcup12tr2_dataset.html"> <a href="../regression/kddcup12tr2_dataset.html"> - <b>8.2.1.</b> + <b>8.3.1.</b> Data preparation @@ -1405,12 +1441,12 @@ </li> - <li class="chapter " data-level="8.2.2" data-path="../regression/kddcup12tr2_lr.html"> + <li class="chapter " data-level="8.3.2" data-path="../regression/kddcup12tr2_lr.html"> <a href="../regression/kddcup12tr2_lr.html"> - <b>8.2.2.</b> + <b>8.3.2.</b> Logistic Regression, Passive Aggressive @@ -1420,12 +1456,12 @@ </li> - <li class="chapter " data-level="8.2.3" data-path="../regression/kddcup12tr2_lr_amplify.html"> + <li class="chapter " data-level="8.3.3" data-path="../regression/kddcup12tr2_lr_amplify.html"> <a href="../regression/kddcup12tr2_lr_amplify.html"> - <b>8.2.3.</b> + <b>8.3.3.</b> Logistic Regression with Amplifier @@ -1435,12 +1471,12 @@ </li> - <li class="chapter " data-level="8.2.4" data-path="../regression/kddcup12tr2_adagrad.html"> + <li class="chapter " data-level="8.3.4" data-path="../regression/kddcup12tr2_adagrad.html"> <a href="../regression/kddcup12tr2_adagrad.html"> - <b>8.2.4.</b> + <b>8.3.4.</b> AdaGrad, AdaDelta @@ -2124,19 +2160,23 @@ <span class="hljs-keyword">select</span> docid, feature(word, <span class="hljs-keyword">count</span>(word)) <span class="hljs-keyword">as</span> f - <span class="hljs-keyword">from</span> docs t1 lateral <span class="hljs-keyword">view</span> explode(tokenize(doc, <span class="hljs-literal">true</span>)) t2 <span class="hljs-keyword">as</span> word + <span class="hljs-keyword">from</span> + docs t1 + lateral <span class="hljs-keyword">view</span> explode(tokenize(doc, <span class="hljs-literal">true</span>)) t2 <span class="hljs-keyword">as</span> word <span class="hljs-keyword">where</span> <span class="hljs-keyword">not</span> is_stopword(word) <span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span> docid, word -) -<span class="hljs-keyword">select</span> - train_plsa(feature, <span class="hljs-string">"-topics 2 -eps 0.00001 -iter 2048 -alpha 0.01"</span>) <span class="hljs-keyword">as</span> (label, word, prob) -<span class="hljs-keyword">from</span> ( - <span class="hljs-keyword">select</span> docid, collect_set(f) <span class="hljs-keyword">as</span> feature +), +<span class="hljs-keyword">input</span> <span class="hljs-keyword">as</span> ( + <span class="hljs-keyword">select</span> docid, collect_list(f) <span class="hljs-keyword">as</span> features <span class="hljs-keyword">from</span> word_counts <span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span> docid -) t +) +<span class="hljs-keyword">select</span> + train_plsa(features, <span class="hljs-string">'-topics 2 -eps 0.00001 -iter 2048 -alpha 0.01'</span>) <span class="hljs-keyword">as</span> (label, word, prob) +<span class="hljs-keyword">from</span> + <span class="hljs-keyword">input</span> ; </code></pre> <table> @@ -2246,7 +2286,9 @@ docid, word, <span class="hljs-keyword">count</span>(word) <span class="hljs-keyword">as</span> <span class="hljs-keyword">value</span> - <span class="hljs-keyword">from</span> docs t1 LATERAL <span class="hljs-keyword">VIEW</span> explode(tokenize(doc, <span class="hljs-literal">true</span>)) t2 <span class="hljs-keyword">as</span> word + <span class="hljs-keyword">from</span> + docs t1 + LATERAL <span class="hljs-keyword">VIEW</span> explode(tokenize(doc, <span class="hljs-literal">true</span>)) t2 <span class="hljs-keyword">as</span> word <span class="hljs-keyword">where</span> <span class="hljs-keyword">not</span> is_stopword(word) <span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span> @@ -2255,20 +2297,25 @@ topic <span class="hljs-keyword">as</span> ( <span class="hljs-keyword">select</span> t.docid, - plsa_predict(t.word, t.<span class="hljs-keyword">value</span>, m.label, m.prob, <span class="hljs-string">"-topics 2"</span>) <span class="hljs-keyword">as</span> probabilities + plsa_predict(t.word, t.<span class="hljs-keyword">value</span>, m.label, m.prob, <span class="hljs-string">'-topics 2'</span>) <span class="hljs-keyword">as</span> probabilities <span class="hljs-keyword">from</span> <span class="hljs-keyword">test</span> t <span class="hljs-keyword">JOIN</span> plsa_model m <span class="hljs-keyword">ON</span> (t.word = m.word) <span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span> t.docid ) -<span class="hljs-keyword">select</span> docid, probabilities, probabilities[<span class="hljs-number">0</span>].label, m.words <span class="hljs-comment">-- topic each document should be assigned</span> -<span class="hljs-keyword">from</span> topic t -<span class="hljs-keyword">join</span> ( - <span class="hljs-keyword">select</span> label, collect_set(feature(word, prob)) <span class="hljs-keyword">as</span> words - <span class="hljs-keyword">from</span> plsa_model - <span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span> label -) m <span class="hljs-keyword">on</span> t.probabilities[<span class="hljs-number">0</span>].label = m.label +<span class="hljs-keyword">select</span> + docid, + probabilities, + probabilities[<span class="hljs-number">0</span>].label, + m.words <span class="hljs-comment">-- topic each document should be assigned</span> +<span class="hljs-keyword">from</span> + topic t + <span class="hljs-keyword">JOIN</span> ( + <span class="hljs-keyword">select</span> label, collect_list(feature(word, prob)) <span class="hljs-keyword">as</span> words + <span class="hljs-keyword">from</span> plsa_model + <span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span> label + ) m <span class="hljs-keyword">on</span> t.probabilities[<span class="hljs-number">0</span>].label = m.label ; </code></pre> <table> @@ -2301,7 +2348,7 @@ topic <span class="hljs-keyword">as</span> ( <p>For the reasons that we mentioned above, we recommend you to first use LDA. After that, if you encountered problems such as slow running time and undesirable clustering results, let you try alternative pLSA approach.</p> <h1 id="setting-hyper-parameter-alpha">Setting hyper-parameter <code>alpha</code></h1> <p>For training pLSA, we set a hyper-parameter <code>alpha</code> in the above example:</p> -<pre><code class="lang-sql"><span class="hljs-keyword">SELECT</span> train_plsa(feature, <span class="hljs-string">"-topics 2 -eps 0.00001 -iter 2048 -alpha 0.01"</span>) +<pre><code class="lang-sql"><span class="hljs-keyword">SELECT</span> train_plsa(feature, <span class="hljs-string">'-topics 2 -eps 0.00001 -iter 2048 -alpha 0.01'</span>) </code></pre> <p>This value controls <strong>how much iterative model update is affected by the old results</strong>.</p> <p>From an algorithmic point of view, training pLSA (and LDA) iteratively repeats certain operations and updates the target value (i.e., probability obtained as a result of <code>train_plsa()</code>). This iterative procedure gradually makes the probabilities more accurate. What <code>alpha</code> does is to control the degree of the change of probabilities in each step.</p> @@ -2309,7 +2356,7 @@ topic <span class="hljs-keyword">as</span> ( <pre><code>Perplexity would be Infinity. Try different mini-batch size `-s`, larger `-delta` and/or larger `-alpha`. </code></pre><p>In that case, you need to try different hyper-parameters to avoid overfitting as the exception suggests.</p> <p>For instance, <a href="http://qwone.com/~jason/20Newsgroups/" target="_blank">20 newsgroups dataset</a> which consists of 10906 realistic documents empirically requires the following options:</p> -<pre><code class="lang-sql"><span class="hljs-keyword">SELECT</span> train_plsa(features, <span class="hljs-string">"-topics 20 -iter 10 -s 128 -delta 0.01 -alpha 512 -eps 0.1"</span>) +<pre><code class="lang-sql"><span class="hljs-keyword">SELECT</span> train_plsa(features, <span class="hljs-string">'-topics 20 -iter 10 -s 128 -delta 0.01 -alpha 512 -eps 0.1'</span>) </code></pre> <p>Clearly, <code>alpha</code> is much larger than <code>0.01</code> which was used for the dummy data above. Let you keep in mind that an appropriate value of <code>alpha</code> highly depends on the number of documents and mini-batch size. <div id="page-footer" class="localized-footer"><hr><!-- @@ -2367,7 +2414,7 @@ Apache Hivemall is an effort undergoing incubation at The Apache Software Founda <script> var gitbook = gitbook || []; gitbook.push(function() { - gitbook.page.hasChanged({"page":{"title":"Probabilistic Latent Semantic Analysis","level":"11.2","depth":1,"next":{"title":"Lat/Lon functions","level":"12.1","depth":1,"path":"geospatial/latlon.md","ref":"geospatial/latlon.md","articles":[]},"previous":{"title":"Latent Dirichlet Allocation","level":"11.1","depth":1,"path":"clustering/lda.md","ref":"clustering/lda.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"h2lb":3,"header":1,"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"}," splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md","hline":"true"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{ "selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"clustering/plsa.md","mtime":"2017-05-30T05:53:27.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2017-06-15T10:33:21.138Z"},"basePath":"..","book":{"language":""}}); + gitbook.page.hasChanged({"page":{"title":"Probabilistic Latent Semantic Analysis","level":"11.2","depth":1,"next":{"title":"Lat/Lon functions","level":"12.1","depth":1,"path":"geospatial/latlon.md","ref":"geospatial/latlon.md","articles":[]},"previous":{"title":"Latent Dirichlet Allocation","level":"11.1","depth":1,"path":"clustering/lda.md","ref":"clustering/lda.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"h2lb":3,"header":1,"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"}," splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md","hline":"true"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{ "selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"clustering/plsa.md","mtime":"2017-06-23T09:56:22.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2017-06-23T09:59:20.878Z"},"basePath":"..","book":{"language":""}}); }); </script> </div>
http://git-wip-us.apache.org/repos/asf/incubator-hivemall-site/blob/ba518dab/userguide/docker/getting_started.html ---------------------------------------------------------------------- diff --git a/userguide/docker/getting_started.html b/userguide/docker/getting_started.html index cbfa5c5..f9cadba 100644 --- a/userguide/docker/getting_started.html +++ b/userguide/docker/getting_started.html @@ -596,14 +596,30 @@ </li> - <li class="chapter " data-level="3.5" data-path="../ft_engineering/tfidf.html"> + <li class="chapter " data-level="3.5" data-path="../ft_engineering/pairing.html"> - <a href="../ft_engineering/tfidf.html"> + <a href="../ft_engineering/pairing.html"> <b>3.5.</b> - TF-IDF Calculation + FEATURE PAIRING + + </a> + + + + <ul class="articles"> + + + <li class="chapter " data-level="3.5.1" data-path="../ft_engineering/polynomial.html"> + + <a href="../ft_engineering/polynomial.html"> + + + <b>3.5.1.</b> + + Polynomial Features </a> @@ -611,6 +627,11 @@ </li> + + </ul> + + </li> + <li class="chapter " data-level="3.6" data-path="../ft_engineering/ft_trans.html"> <a href="../ft_engineering/ft_trans.html"> @@ -662,6 +683,21 @@ </li> + <li class="chapter " data-level="3.7" data-path="../ft_engineering/tfidf.html"> + + <a href="../ft_engineering/tfidf.html"> + + + <b>3.7.</b> + + TF-IDF Calculation + + </a> + + + + </li> + @@ -759,7 +795,7 @@ - <li class="header">Part V - Prediction</li> + <li class="header">Part V - Supervised Learning</li> @@ -778,27 +814,19 @@ </li> - <li class="chapter " data-level="5.2" data-path="../regression/general.html"> - - <a href="../regression/general.html"> - - - <b>5.2.</b> - - Regression - - </a> - - - </li> - <li class="chapter " data-level="5.3" data-path="../binaryclass/general.html"> + + <li class="header">Part VI - Binary classification</li> + + + + <li class="chapter " data-level="6.1" data-path="../binaryclass/general.html"> <a href="../binaryclass/general.html"> - <b>5.3.</b> + <b>6.1.</b> Binary Classification @@ -808,21 +836,14 @@ </li> - - - - <li class="header">Part VI - Binary classification tutorials</li> - - - - <li class="chapter " data-level="6.1" data-path="../binaryclass/a9a.html"> + <li class="chapter " data-level="6.2" data-path="../binaryclass/a9a.html"> <a href="../binaryclass/a9a.html"> - <b>6.1.</b> + <b>6.2.</b> - a9a + a9a tutorial </a> @@ -831,12 +852,12 @@ <ul class="articles"> - <li class="chapter " data-level="6.1.1" data-path="../binaryclass/a9a_dataset.html"> + <li class="chapter " data-level="6.2.1" data-path="../binaryclass/a9a_dataset.html"> <a href="../binaryclass/a9a_dataset.html"> - <b>6.1.1.</b> + <b>6.2.1.</b> Data preparation @@ -846,12 +867,12 @@ </li> - <li class="chapter " data-level="6.1.2" data-path="../binaryclass/a9a_lr.html"> + <li class="chapter " data-level="6.2.2" data-path="../binaryclass/a9a_lr.html"> <a href="../binaryclass/a9a_lr.html"> - <b>6.1.2.</b> + <b>6.2.2.</b> Logistic Regression @@ -861,12 +882,12 @@ </li> - <li class="chapter " data-level="6.1.3" data-path="../binaryclass/a9a_minibatch.html"> + <li class="chapter " data-level="6.2.3" data-path="../binaryclass/a9a_minibatch.html"> <a href="../binaryclass/a9a_minibatch.html"> - <b>6.1.3.</b> + <b>6.2.3.</b> Mini-batch Gradient Descent @@ -881,14 +902,14 @@ </li> - <li class="chapter " data-level="6.2" data-path="../binaryclass/news20.html"> + <li class="chapter " data-level="6.3" data-path="../binaryclass/news20.html"> <a href="../binaryclass/news20.html"> - <b>6.2.</b> + <b>6.3.</b> - News20 + News20 tutorial </a> @@ -897,12 +918,12 @@ <ul class="articles"> - <li class="chapter " data-level="6.2.1" data-path="../binaryclass/news20_dataset.html"> + <li class="chapter " data-level="6.3.1" data-path="../binaryclass/news20_dataset.html"> <a href="../binaryclass/news20_dataset.html"> - <b>6.2.1.</b> + <b>6.3.1.</b> Data preparation @@ -912,12 +933,12 @@ </li> - <li class="chapter " data-level="6.2.2" data-path="../binaryclass/news20_pa.html"> + <li class="chapter " data-level="6.3.2" data-path="../binaryclass/news20_pa.html"> <a href="../binaryclass/news20_pa.html"> - <b>6.2.2.</b> + <b>6.3.2.</b> Perceptron, Passive Aggressive @@ -927,12 +948,12 @@ </li> - <li class="chapter " data-level="6.2.3" data-path="../binaryclass/news20_scw.html"> + <li class="chapter " data-level="6.3.3" data-path="../binaryclass/news20_scw.html"> <a href="../binaryclass/news20_scw.html"> - <b>6.2.3.</b> + <b>6.3.3.</b> CW, AROW, SCW @@ -942,12 +963,12 @@ </li> - <li class="chapter " data-level="6.2.4" data-path="../binaryclass/news20_adagrad.html"> + <li class="chapter " data-level="6.3.4" data-path="../binaryclass/news20_adagrad.html"> <a href="../binaryclass/news20_adagrad.html"> - <b>6.2.4.</b> + <b>6.3.4.</b> AdaGradRDA, AdaGrad, AdaDelta @@ -962,14 +983,14 @@ </li> - <li class="chapter " data-level="6.3" data-path="../binaryclass/kdd2010a.html"> + <li class="chapter " data-level="6.4" data-path="../binaryclass/kdd2010a.html"> <a href="../binaryclass/kdd2010a.html"> - <b>6.3.</b> + <b>6.4.</b> - KDD2010a + KDD2010a tutorial </a> @@ -978,12 +999,12 @@ <ul class="articles"> - <li class="chapter " data-level="6.3.1" data-path="../binaryclass/kdd2010a_dataset.html"> + <li class="chapter " data-level="6.4.1" data-path="../binaryclass/kdd2010a_dataset.html"> <a href="../binaryclass/kdd2010a_dataset.html"> - <b>6.3.1.</b> + <b>6.4.1.</b> Data preparation @@ -993,12 +1014,12 @@ </li> - <li class="chapter " data-level="6.3.2" data-path="../binaryclass/kdd2010a_scw.html"> + <li class="chapter " data-level="6.4.2" data-path="../binaryclass/kdd2010a_scw.html"> <a href="../binaryclass/kdd2010a_scw.html"> - <b>6.3.2.</b> + <b>6.4.2.</b> PA, CW, AROW, SCW @@ -1013,14 +1034,14 @@ </li> - <li class="chapter " data-level="6.4" data-path="../binaryclass/kdd2010b.html"> + <li class="chapter " data-level="6.5" data-path="../binaryclass/kdd2010b.html"> <a href="../binaryclass/kdd2010b.html"> - <b>6.4.</b> + <b>6.5.</b> - KDD2010b + KDD2010b tutorial </a> @@ -1029,12 +1050,12 @@ <ul class="articles"> - <li class="chapter " data-level="6.4.1" data-path="../binaryclass/kdd2010b_dataset.html"> + <li class="chapter " data-level="6.5.1" data-path="../binaryclass/kdd2010b_dataset.html"> <a href="../binaryclass/kdd2010b_dataset.html"> - <b>6.4.1.</b> + <b>6.5.1.</b> Data preparation @@ -1044,12 +1065,12 @@ </li> - <li class="chapter " data-level="6.4.2" data-path="../binaryclass/kdd2010b_arow.html"> + <li class="chapter " data-level="6.5.2" data-path="../binaryclass/kdd2010b_arow.html"> <a href="../binaryclass/kdd2010b_arow.html"> - <b>6.4.2.</b> + <b>6.5.2.</b> AROW @@ -1064,14 +1085,14 @@ </li> - <li class="chapter " data-level="6.5" data-path="../binaryclass/webspam.html"> + <li class="chapter " data-level="6.6" data-path="../binaryclass/webspam.html"> <a href="../binaryclass/webspam.html"> - <b>6.5.</b> + <b>6.6.</b> - Webspam + Webspam tutorial </a> @@ -1080,12 +1101,12 @@ <ul class="articles"> - <li class="chapter " data-level="6.5.1" data-path="../binaryclass/webspam_dataset.html"> + <li class="chapter " data-level="6.6.1" data-path="../binaryclass/webspam_dataset.html"> <a href="../binaryclass/webspam_dataset.html"> - <b>6.5.1.</b> + <b>6.6.1.</b> Data pareparation @@ -1095,12 +1116,12 @@ </li> - <li class="chapter " data-level="6.5.2" data-path="../binaryclass/webspam_scw.html"> + <li class="chapter " data-level="6.6.2" data-path="../binaryclass/webspam_scw.html"> <a href="../binaryclass/webspam_scw.html"> - <b>6.5.2.</b> + <b>6.6.2.</b> PA1, AROW, SCW @@ -1115,14 +1136,14 @@ </li> - <li class="chapter " data-level="6.6" data-path="../binaryclass/titanic_rf.html"> + <li class="chapter " data-level="6.7" data-path="../binaryclass/titanic_rf.html"> <a href="../binaryclass/titanic_rf.html"> - <b>6.6.</b> + <b>6.7.</b> - Kaggle Titanic + Kaggle Titanic tutorial </a> @@ -1133,7 +1154,7 @@ - <li class="header">Part VII - Multiclass classification tutorials</li> + <li class="header">Part VII - Multiclass classification</li> @@ -1144,7 +1165,7 @@ <b>7.1.</b> - News20 Multiclass + News20 Multiclass tutorial </a> @@ -1255,7 +1276,7 @@ <b>7.2.</b> - Iris + Iris tutorial </a> @@ -1317,18 +1338,33 @@ - <li class="header">Part VIII - Regression tutorials</li> + <li class="header">Part VIII - Regression</li> - <li class="chapter " data-level="8.1" data-path="../regression/e2006.html"> + <li class="chapter " data-level="8.1" data-path="../regression/general.html"> - <a href="../regression/e2006.html"> + <a href="../regression/general.html"> <b>8.1.</b> - E2006-tfidf regression + Regression + + </a> + + + + </li> + + <li class="chapter " data-level="8.2" data-path="../regression/e2006.html"> + + <a href="../regression/e2006.html"> + + + <b>8.2.</b> + + E2006-tfidf regression tutorial </a> @@ -1337,12 +1373,12 @@ <ul class="articles"> - <li class="chapter " data-level="8.1.1" data-path="../regression/e2006_dataset.html"> + <li class="chapter " data-level="8.2.1" data-path="../regression/e2006_dataset.html"> <a href="../regression/e2006_dataset.html"> - <b>8.1.1.</b> + <b>8.2.1.</b> Data preparation @@ -1352,12 +1388,12 @@ </li> - <li class="chapter " data-level="8.1.2" data-path="../regression/e2006_arow.html"> + <li class="chapter " data-level="8.2.2" data-path="../regression/e2006_arow.html"> <a href="../regression/e2006_arow.html"> - <b>8.1.2.</b> + <b>8.2.2.</b> Passive Aggressive, AROW @@ -1372,14 +1408,14 @@ </li> - <li class="chapter " data-level="8.2" data-path="../regression/kddcup12tr2.html"> + <li class="chapter " data-level="8.3" data-path="../regression/kddcup12tr2.html"> <a href="../regression/kddcup12tr2.html"> - <b>8.2.</b> + <b>8.3.</b> - KDDCup 2012 track 2 CTR prediction + KDDCup 2012 track 2 CTR prediction tutorial </a> @@ -1388,12 +1424,12 @@ <ul class="articles"> - <li class="chapter " data-level="8.2.1" data-path="../regression/kddcup12tr2_dataset.html"> + <li class="chapter " data-level="8.3.1" data-path="../regression/kddcup12tr2_dataset.html"> <a href="../regression/kddcup12tr2_dataset.html"> - <b>8.2.1.</b> + <b>8.3.1.</b> Data preparation @@ -1403,12 +1439,12 @@ </li> - <li class="chapter " data-level="8.2.2" data-path="../regression/kddcup12tr2_lr.html"> + <li class="chapter " data-level="8.3.2" data-path="../regression/kddcup12tr2_lr.html"> <a href="../regression/kddcup12tr2_lr.html"> - <b>8.2.2.</b> + <b>8.3.2.</b> Logistic Regression, Passive Aggressive @@ -1418,12 +1454,12 @@ </li> - <li class="chapter " data-level="8.2.3" data-path="../regression/kddcup12tr2_lr_amplify.html"> + <li class="chapter " data-level="8.3.3" data-path="../regression/kddcup12tr2_lr_amplify.html"> <a href="../regression/kddcup12tr2_lr_amplify.html"> - <b>8.2.3.</b> + <b>8.3.3.</b> Logistic Regression with Amplifier @@ -1433,12 +1469,12 @@ </li> - <li class="chapter " data-level="8.2.4" data-path="../regression/kddcup12tr2_adagrad.html"> + <li class="chapter " data-level="8.3.4" data-path="../regression/kddcup12tr2_adagrad.html"> <a href="../regression/kddcup12tr2_adagrad.html"> - <b>8.2.4.</b> + <b>8.3.4.</b> AdaGrad, AdaDelta @@ -2220,7 +2256,7 @@ Apache Hivemall is an effort undergoing incubation at The Apache Software Founda <script> var gitbook = gitbook || []; gitbook.push(function() { - gitbook.page.hasChanged({"page":{"title":"Getting Started","level":"14.1","depth":1,"next":{"title":"Hivemall on Apache Spark","level":"15.1","depth":1,"url":"https://github.com/maropu/hivemall-spark","ref":"https://github.com/maropu/hivemall-spark","articles":[]},"previous":{"title":"Other utility functions","level":"13.4.2","depth":2,"path":"spark/misc/functions.md","ref":"spark/misc/functions.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"h2lb":3,"header":1,"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.c om/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md","hline":"true"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css "},"showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"docker/getting_started.md","mtime":"2017-06-15T10:18:54.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2017-06-15T10:33:21.138Z"},"basePath":"..","book":{"language":""}}); + gitbook.page.hasChanged({"page":{"title":"Getting Started","level":"14.1","depth":1,"next":{"title":"Hivemall on Apache Spark","level":"15.1","depth":1,"url":"https://github.com/maropu/hivemall-spark","ref":"https://github.com/maropu/hivemall-spark","articles":[]},"previous":{"title":"Other utility functions","level":"13.4.2","depth":2,"path":"spark/misc/functions.md","ref":"spark/misc/functions.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"h2lb":3,"header":1,"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.c om/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md","hline":"true"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css "},"showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"docker/getting_started.md","mtime":"2017-06-06T06:40:03.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2017-06-23T09:59:20.878Z"},"basePath":"..","book":{"language":""}}); }); </script> </div> http://git-wip-us.apache.org/repos/asf/incubator-hivemall-site/blob/ba518dab/userguide/eval/auc.html ---------------------------------------------------------------------- diff --git a/userguide/eval/auc.html b/userguide/eval/auc.html index 044075b..ad66883 100644 --- a/userguide/eval/auc.html +++ b/userguide/eval/auc.html @@ -598,14 +598,30 @@ </li> - <li class="chapter " data-level="3.5" data-path="../ft_engineering/tfidf.html"> + <li class="chapter " data-level="3.5" data-path="../ft_engineering/pairing.html"> - <a href="../ft_engineering/tfidf.html"> + <a href="../ft_engineering/pairing.html"> <b>3.5.</b> - TF-IDF Calculation + FEATURE PAIRING + + </a> + + + + <ul class="articles"> + + + <li class="chapter " data-level="3.5.1" data-path="../ft_engineering/polynomial.html"> + + <a href="../ft_engineering/polynomial.html"> + + + <b>3.5.1.</b> + + Polynomial Features </a> @@ -613,6 +629,11 @@ </li> + + </ul> + + </li> + <li class="chapter " data-level="3.6" data-path="../ft_engineering/ft_trans.html"> <a href="../ft_engineering/ft_trans.html"> @@ -664,6 +685,21 @@ </li> + <li class="chapter " data-level="3.7" data-path="../ft_engineering/tfidf.html"> + + <a href="../ft_engineering/tfidf.html"> + + + <b>3.7.</b> + + TF-IDF Calculation + + </a> + + + + </li> + @@ -761,7 +797,7 @@ - <li class="header">Part V - Prediction</li> + <li class="header">Part V - Supervised Learning</li> @@ -780,27 +816,19 @@ </li> - <li class="chapter " data-level="5.2" data-path="../regression/general.html"> - - <a href="../regression/general.html"> - - - <b>5.2.</b> - - Regression - - </a> - - - </li> - <li class="chapter " data-level="5.3" data-path="../binaryclass/general.html"> + + <li class="header">Part VI - Binary classification</li> + + + + <li class="chapter " data-level="6.1" data-path="../binaryclass/general.html"> <a href="../binaryclass/general.html"> - <b>5.3.</b> + <b>6.1.</b> Binary Classification @@ -810,21 +838,14 @@ </li> - - - - <li class="header">Part VI - Binary classification tutorials</li> - - - - <li class="chapter " data-level="6.1" data-path="../binaryclass/a9a.html"> + <li class="chapter " data-level="6.2" data-path="../binaryclass/a9a.html"> <a href="../binaryclass/a9a.html"> - <b>6.1.</b> + <b>6.2.</b> - a9a + a9a tutorial </a> @@ -833,12 +854,12 @@ <ul class="articles"> - <li class="chapter " data-level="6.1.1" data-path="../binaryclass/a9a_dataset.html"> + <li class="chapter " data-level="6.2.1" data-path="../binaryclass/a9a_dataset.html"> <a href="../binaryclass/a9a_dataset.html"> - <b>6.1.1.</b> + <b>6.2.1.</b> Data preparation @@ -848,12 +869,12 @@ </li> - <li class="chapter " data-level="6.1.2" data-path="../binaryclass/a9a_lr.html"> + <li class="chapter " data-level="6.2.2" data-path="../binaryclass/a9a_lr.html"> <a href="../binaryclass/a9a_lr.html"> - <b>6.1.2.</b> + <b>6.2.2.</b> Logistic Regression @@ -863,12 +884,12 @@ </li> - <li class="chapter " data-level="6.1.3" data-path="../binaryclass/a9a_minibatch.html"> + <li class="chapter " data-level="6.2.3" data-path="../binaryclass/a9a_minibatch.html"> <a href="../binaryclass/a9a_minibatch.html"> - <b>6.1.3.</b> + <b>6.2.3.</b> Mini-batch Gradient Descent @@ -883,14 +904,14 @@ </li> - <li class="chapter " data-level="6.2" data-path="../binaryclass/news20.html"> + <li class="chapter " data-level="6.3" data-path="../binaryclass/news20.html"> <a href="../binaryclass/news20.html"> - <b>6.2.</b> + <b>6.3.</b> - News20 + News20 tutorial </a> @@ -899,12 +920,12 @@ <ul class="articles"> - <li class="chapter " data-level="6.2.1" data-path="../binaryclass/news20_dataset.html"> + <li class="chapter " data-level="6.3.1" data-path="../binaryclass/news20_dataset.html"> <a href="../binaryclass/news20_dataset.html"> - <b>6.2.1.</b> + <b>6.3.1.</b> Data preparation @@ -914,12 +935,12 @@ </li> - <li class="chapter " data-level="6.2.2" data-path="../binaryclass/news20_pa.html"> + <li class="chapter " data-level="6.3.2" data-path="../binaryclass/news20_pa.html"> <a href="../binaryclass/news20_pa.html"> - <b>6.2.2.</b> + <b>6.3.2.</b> Perceptron, Passive Aggressive @@ -929,12 +950,12 @@ </li> - <li class="chapter " data-level="6.2.3" data-path="../binaryclass/news20_scw.html"> + <li class="chapter " data-level="6.3.3" data-path="../binaryclass/news20_scw.html"> <a href="../binaryclass/news20_scw.html"> - <b>6.2.3.</b> + <b>6.3.3.</b> CW, AROW, SCW @@ -944,12 +965,12 @@ </li> - <li class="chapter " data-level="6.2.4" data-path="../binaryclass/news20_adagrad.html"> + <li class="chapter " data-level="6.3.4" data-path="../binaryclass/news20_adagrad.html"> <a href="../binaryclass/news20_adagrad.html"> - <b>6.2.4.</b> + <b>6.3.4.</b> AdaGradRDA, AdaGrad, AdaDelta @@ -964,14 +985,14 @@ </li> - <li class="chapter " data-level="6.3" data-path="../binaryclass/kdd2010a.html"> + <li class="chapter " data-level="6.4" data-path="../binaryclass/kdd2010a.html"> <a href="../binaryclass/kdd2010a.html"> - <b>6.3.</b> + <b>6.4.</b> - KDD2010a + KDD2010a tutorial </a> @@ -980,12 +1001,12 @@ <ul class="articles"> - <li class="chapter " data-level="6.3.1" data-path="../binaryclass/kdd2010a_dataset.html"> + <li class="chapter " data-level="6.4.1" data-path="../binaryclass/kdd2010a_dataset.html"> <a href="../binaryclass/kdd2010a_dataset.html"> - <b>6.3.1.</b> + <b>6.4.1.</b> Data preparation @@ -995,12 +1016,12 @@ </li> - <li class="chapter " data-level="6.3.2" data-path="../binaryclass/kdd2010a_scw.html"> + <li class="chapter " data-level="6.4.2" data-path="../binaryclass/kdd2010a_scw.html"> <a href="../binaryclass/kdd2010a_scw.html"> - <b>6.3.2.</b> + <b>6.4.2.</b> PA, CW, AROW, SCW @@ -1015,14 +1036,14 @@ </li> - <li class="chapter " data-level="6.4" data-path="../binaryclass/kdd2010b.html"> + <li class="chapter " data-level="6.5" data-path="../binaryclass/kdd2010b.html"> <a href="../binaryclass/kdd2010b.html"> - <b>6.4.</b> + <b>6.5.</b> - KDD2010b + KDD2010b tutorial </a> @@ -1031,12 +1052,12 @@ <ul class="articles"> - <li class="chapter " data-level="6.4.1" data-path="../binaryclass/kdd2010b_dataset.html"> + <li class="chapter " data-level="6.5.1" data-path="../binaryclass/kdd2010b_dataset.html"> <a href="../binaryclass/kdd2010b_dataset.html"> - <b>6.4.1.</b> + <b>6.5.1.</b> Data preparation @@ -1046,12 +1067,12 @@ </li> - <li class="chapter " data-level="6.4.2" data-path="../binaryclass/kdd2010b_arow.html"> + <li class="chapter " data-level="6.5.2" data-path="../binaryclass/kdd2010b_arow.html"> <a href="../binaryclass/kdd2010b_arow.html"> - <b>6.4.2.</b> + <b>6.5.2.</b> AROW @@ -1066,14 +1087,14 @@ </li> - <li class="chapter " data-level="6.5" data-path="../binaryclass/webspam.html"> + <li class="chapter " data-level="6.6" data-path="../binaryclass/webspam.html"> <a href="../binaryclass/webspam.html"> - <b>6.5.</b> + <b>6.6.</b> - Webspam + Webspam tutorial </a> @@ -1082,12 +1103,12 @@ <ul class="articles"> - <li class="chapter " data-level="6.5.1" data-path="../binaryclass/webspam_dataset.html"> + <li class="chapter " data-level="6.6.1" data-path="../binaryclass/webspam_dataset.html"> <a href="../binaryclass/webspam_dataset.html"> - <b>6.5.1.</b> + <b>6.6.1.</b> Data pareparation @@ -1097,12 +1118,12 @@ </li> - <li class="chapter " data-level="6.5.2" data-path="../binaryclass/webspam_scw.html"> + <li class="chapter " data-level="6.6.2" data-path="../binaryclass/webspam_scw.html"> <a href="../binaryclass/webspam_scw.html"> - <b>6.5.2.</b> + <b>6.6.2.</b> PA1, AROW, SCW @@ -1117,14 +1138,14 @@ </li> - <li class="chapter " data-level="6.6" data-path="../binaryclass/titanic_rf.html"> + <li class="chapter " data-level="6.7" data-path="../binaryclass/titanic_rf.html"> <a href="../binaryclass/titanic_rf.html"> - <b>6.6.</b> + <b>6.7.</b> - Kaggle Titanic + Kaggle Titanic tutorial </a> @@ -1135,7 +1156,7 @@ - <li class="header">Part VII - Multiclass classification tutorials</li> + <li class="header">Part VII - Multiclass classification</li> @@ -1146,7 +1167,7 @@ <b>7.1.</b> - News20 Multiclass + News20 Multiclass tutorial </a> @@ -1257,7 +1278,7 @@ <b>7.2.</b> - Iris + Iris tutorial </a> @@ -1319,18 +1340,33 @@ - <li class="header">Part VIII - Regression tutorials</li> + <li class="header">Part VIII - Regression</li> - <li class="chapter " data-level="8.1" data-path="../regression/e2006.html"> + <li class="chapter " data-level="8.1" data-path="../regression/general.html"> - <a href="../regression/e2006.html"> + <a href="../regression/general.html"> <b>8.1.</b> - E2006-tfidf regression + Regression + + </a> + + + + </li> + + <li class="chapter " data-level="8.2" data-path="../regression/e2006.html"> + + <a href="../regression/e2006.html"> + + + <b>8.2.</b> + + E2006-tfidf regression tutorial </a> @@ -1339,12 +1375,12 @@ <ul class="articles"> - <li class="chapter " data-level="8.1.1" data-path="../regression/e2006_dataset.html"> + <li class="chapter " data-level="8.2.1" data-path="../regression/e2006_dataset.html"> <a href="../regression/e2006_dataset.html"> - <b>8.1.1.</b> + <b>8.2.1.</b> Data preparation @@ -1354,12 +1390,12 @@ </li> - <li class="chapter " data-level="8.1.2" data-path="../regression/e2006_arow.html"> + <li class="chapter " data-level="8.2.2" data-path="../regression/e2006_arow.html"> <a href="../regression/e2006_arow.html"> - <b>8.1.2.</b> + <b>8.2.2.</b> Passive Aggressive, AROW @@ -1374,14 +1410,14 @@ </li> - <li class="chapter " data-level="8.2" data-path="../regression/kddcup12tr2.html"> + <li class="chapter " data-level="8.3" data-path="../regression/kddcup12tr2.html"> <a href="../regression/kddcup12tr2.html"> - <b>8.2.</b> + <b>8.3.</b> - KDDCup 2012 track 2 CTR prediction + KDDCup 2012 track 2 CTR prediction tutorial </a> @@ -1390,12 +1426,12 @@ <ul class="articles"> - <li class="chapter " data-level="8.2.1" data-path="../regression/kddcup12tr2_dataset.html"> + <li class="chapter " data-level="8.3.1" data-path="../regression/kddcup12tr2_dataset.html"> <a href="../regression/kddcup12tr2_dataset.html"> - <b>8.2.1.</b> + <b>8.3.1.</b> Data preparation @@ -1405,12 +1441,12 @@ </li> - <li class="chapter " data-level="8.2.2" data-path="../regression/kddcup12tr2_lr.html"> + <li class="chapter " data-level="8.3.2" data-path="../regression/kddcup12tr2_lr.html"> <a href="../regression/kddcup12tr2_lr.html"> - <b>8.2.2.</b> + <b>8.3.2.</b> Logistic Regression, Passive Aggressive @@ -1420,12 +1456,12 @@ </li> - <li class="chapter " data-level="8.2.3" data-path="../regression/kddcup12tr2_lr_amplify.html"> + <li class="chapter " data-level="8.3.3" data-path="../regression/kddcup12tr2_lr_amplify.html"> <a href="../regression/kddcup12tr2_lr_amplify.html"> - <b>8.2.3.</b> + <b>8.3.3.</b> Logistic Regression with Amplifier @@ -1435,12 +1471,12 @@ </li> - <li class="chapter " data-level="8.2.4" data-path="../regression/kddcup12tr2_adagrad.html"> + <li class="chapter " data-level="8.3.4" data-path="../regression/kddcup12tr2_adagrad.html"> <a href="../regression/kddcup12tr2_adagrad.html"> - <b>8.2.4.</b> + <b>8.3.4.</b> AdaGrad, AdaDelta @@ -2078,7 +2114,11 @@ <ul> <li><a href="#area-under-the-roc-curve">Area Under the ROC Curve</a></li> -<li><a href="#compute-auc-on-hivemall">Compute AUC on Hivemall</a></li> +<li><a href="#compute-auc-on-hivemall">Compute AUC on Hivemall</a><ul> +<li><a href="#sequential-auc-computation-on-a-single-node">Sequential AUC computation on a single node</a></li> +<li><a href="#parallel-approximate-auc-computation">Parallel approximate AUC computation</a></li> +</ul> +</li> <li><a href="#difference-between-auc-and-logarithmic-loss">Difference between AUC and Logarithmic Loss</a></li> </ul> @@ -2120,7 +2160,8 @@ <p>Once the rows are sorted by the probabilities in a descending order, AUC gives a metric based on how many positive (<code>label=1</code>) samples are ranked higher than negative (<code>label=0</code>) samples. If many positive rows get larger scores than negative rows, AUC would be large, and hence our classifier would perform well.</p> <h1 id="compute-auc-on-hivemall">Compute AUC on Hivemall</h1> <p>In Hivemall, a function <code>auc(double score, int label)</code> provides a way to compute AUC for pairs of probability and truth label.</p> -<p>For instance, following query computes AUC of the table which was shown above:</p> +<h2 id="sequential-auc-computation-on-a-single-node">Sequential AUC computation on a single node</h2> +<p>For instance, the following query computes AUC of the table which was shown above:</p> <pre><code class="lang-sql">with data as ( <span class="hljs-keyword">select</span> <span class="hljs-number">0.5</span> <span class="hljs-keyword">as</span> prob, <span class="hljs-number">0</span> <span class="hljs-keyword">as</span> label <span class="hljs-keyword">union</span> all @@ -2142,6 +2183,7 @@ </code></pre> <p>This query returns <code>0.83333</code> as AUC.</p> <p>Since AUC is a metric based on ranked probability-label pairs as mentioned above, input data (rows) needs to be ordered by scores in a descending order.</p> +<h2 id="parallel-approximate-auc-computation">Parallel approximate AUC computation</h2> <p>Meanwhile, Hive's <code>distribute by</code> clause allows you to compute AUC in parallel: </p> <pre><code class="lang-sql">with data as ( <span class="hljs-keyword">select</span> <span class="hljs-number">0.5</span> <span class="hljs-keyword">as</span> prob, <span class="hljs-number">0</span> <span class="hljs-keyword">as</span> label @@ -2154,7 +2196,8 @@ <span class="hljs-keyword">union</span> all <span class="hljs-keyword">select</span> <span class="hljs-number">0.7</span> <span class="hljs-keyword">as</span> prob, <span class="hljs-number">1</span> <span class="hljs-keyword">as</span> label ) -<span class="hljs-keyword">select</span> auc(prob, label) <span class="hljs-keyword">as</span> auc +<span class="hljs-keyword">select</span> + auc(prob, label) <span class="hljs-keyword">as</span> auc <span class="hljs-keyword">from</span> ( <span class="hljs-keyword">select</span> prob, label <span class="hljs-keyword">from</span> <span class="hljs-keyword">data</span> @@ -2222,7 +2265,7 @@ Apache Hivemall is an effort undergoing incubation at The Apache Software Founda <script> var gitbook = gitbook || []; gitbook.push(function() { - gitbook.page.hasChanged({"page":{"title":"Area Under the ROC Curve","level":"4.1.1","depth":2,"next":{"title":"Ranking Measures","level":"4.2","depth":1,"path":"eval/rank.md","ref":"eval/rank.md","articles":[]},"previous":{"title":"Statistical evaluation of a prediction model","level":"4.1","depth":1,"path":"eval/stat_eval.md","ref":"eval/stat_eval.md","articles":[{"title":"Area Under the ROC Curve","level":"4.1.1","depth":2,"path":"eval/auc.md","ref":"eval/auc.md","articles":[]}]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"h2lb":3,"header":1,"max depth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md","hline":"true"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":" styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"eval/auc.md","mtime":"2017-05-11T07:50:39.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2017-06-15T10:33:21.138Z"},"basePath":"..","book":{"language":""}}); + gitbook.page.hasChanged({"page":{"title":"Area Under the ROC Curve","level":"4.1.1","depth":2,"next":{"title":"Ranking Measures","level":"4.2","depth":1,"path":"eval/rank.md","ref":"eval/rank.md","articles":[]},"previous":{"title":"Statistical evaluation of a prediction model","level":"4.1","depth":1,"path":"eval/stat_eval.md","ref":"eval/stat_eval.md","articles":[{"title":"Area Under the ROC Curve","level":"4.1.1","depth":2,"path":"eval/auc.md","ref":"eval/auc.md","articles":[]}]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"h2lb":3,"header":1,"max depth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md","hline":"true"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":" styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"eval/auc.md","mtime":"2017-06-23T09:56:22.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2017-06-23T09:59:20.878Z"},"basePath":"..","book":{"language":""}}); }); </script> </div>
