http://git-wip-us.apache.org/repos/asf/incubator-hivemall-site/blob/ceeebcea/userguide/binaryclass/webspam_dataset.html
----------------------------------------------------------------------
diff --git a/userguide/binaryclass/webspam_dataset.html 
b/userguide/binaryclass/webspam_dataset.html
index 61092a0..ae9cec6 100644
--- a/userguide/binaryclass/webspam_dataset.html
+++ b/userguide/binaryclass/webspam_dataset.html
@@ -1667,6 +1667,21 @@
             
         </li>
     
+        <li class="chapter " data-level="10.2" 
data-path="../clustering/plsa.html">
+            
+                <a href="../clustering/plsa.html">
+            
+                    
+                        <b>10.2.</b>
+                    
+                    Probabilistic Latent Semantic Analysis
+            
+                </a>
+            
+
+            
+        </li>
+    
 
     
         
@@ -2102,7 +2117,7 @@ Apache Hivemall is an effort undergoing incubation at The 
Apache Software Founda
     <script>
         var gitbook = gitbook || [];
         gitbook.push(function() {
-            gitbook.page.hasChanged({"page":{"title":"Data 
pareparation","level":"5.5.1","depth":2,"next":{"title":"PA1, AROW, 
SCW","level":"5.5.2","depth":2,"path":"binaryclass/webspam_scw.md","ref":"binaryclass/webspam_scw.md","articles":[]},"previous":{"title":"Webspam
 
Tutorial","level":"5.5","depth":1,"path":"binaryclass/webspam.md","ref":"binaryclass/webspam.md","articles":[{"title":"Data
 
pareparation","level":"5.5.1","depth":2,"path":"binaryclass/webspam_dataset.md","ref":"binaryclass/webspam_dataset.md","articles":[]},{"title":"PA1,
 AROW, 
SCW","level":"5.5.2","depth":2,"path":"binaryclass/webspam_scw.md","ref":"binaryclass/webspam_scw.md","articles":[]}]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"sty
 
les/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"header":1,"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/
 
incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout)
 > 
h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall
 User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> 
Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User
 Manual for Apache 
Hivemall"},"file":{"path":"binaryclass/webspam_dataset.md","mtime":"2016-12-02T08:02:42.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"20
 17-04-25T12:37:57.844Z"},"basePath":"..","book":{"language":""}});
+            gitbook.page.hasChanged({"page":{"title":"Data 
pareparation","level":"5.5.1","depth":2,"next":{"title":"PA1, AROW, 
SCW","level":"5.5.2","depth":2,"path":"binaryclass/webspam_scw.md","ref":"binaryclass/webspam_scw.md","articles":[]},"previous":{"title":"Webspam
 
Tutorial","level":"5.5","depth":1,"path":"binaryclass/webspam.md","ref":"binaryclass/webspam.md","articles":[{"title":"Data
 
pareparation","level":"5.5.1","depth":2,"path":"binaryclass/webspam_dataset.md","ref":"binaryclass/webspam_dataset.md","articles":[]},{"title":"PA1,
 AROW, 
SCW","level":"5.5.2","depth":2,"path":"binaryclass/webspam_scw.md","ref":"binaryclass/webspam_scw.md","articles":[]}]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"sty
 
les/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"header":1,"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/
 
incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout)
 > 
h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall
 User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> 
Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User
 Manual for Apache 
Hivemall"},"file":{"path":"binaryclass/webspam_dataset.md","mtime":"2016-12-02T08:02:42.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"20
 17-04-27T13:49:22.144Z"},"basePath":"..","book":{"language":""}});
         });
     </script>
 </div>

http://git-wip-us.apache.org/repos/asf/incubator-hivemall-site/blob/ceeebcea/userguide/binaryclass/webspam_scw.html
----------------------------------------------------------------------
diff --git a/userguide/binaryclass/webspam_scw.html 
b/userguide/binaryclass/webspam_scw.html
index aca73a7..ae9fff3 100644
--- a/userguide/binaryclass/webspam_scw.html
+++ b/userguide/binaryclass/webspam_scw.html
@@ -1667,6 +1667,21 @@
             
         </li>
     
+        <li class="chapter " data-level="10.2" 
data-path="../clustering/plsa.html">
+            
+                <a href="../clustering/plsa.html">
+            
+                    
+                        <b>10.2.</b>
+                    
+                    Probabilistic Latent Semantic Analysis
+            
+                </a>
+            
+
+            
+        </li>
+    
 
     
         
@@ -2162,7 +2177,7 @@ Apache Hivemall is an effort undergoing incubation at The 
Apache Software Founda
     <script>
         var gitbook = gitbook || [];
         gitbook.push(function() {
-            gitbook.page.hasChanged({"page":{"title":"PA1, AROW, 
SCW","level":"5.5.2","depth":2,"next":{"title":"Kaggle Titanic 
Tutorial","level":"5.6","depth":1,"path":"binaryclass/titanic_rf.md","ref":"binaryclass/titanic_rf.md","articles":[]},"previous":{"title":"Data
 
pareparation","level":"5.5.1","depth":2,"path":"binaryclass/webspam_dataset.md","ref":"binaryclass/webspam_dataset.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"header":1,"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"
 
splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,
 h2,h3,*:not(.callout) > 
h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall
 User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> 
Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User
 Manual for Apache 
Hivemall"},"file":{"path":"binaryclass/webspam_scw.md","mtime":"2016-12-02T08:02:42.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2017-04-25T12:37:57.844Z"},"basePath":"..","book":{"language":""}});
+            gitbook.page.hasChanged({"page":{"title":"PA1, AROW, 
SCW","level":"5.5.2","depth":2,"next":{"title":"Kaggle Titanic 
Tutorial","level":"5.6","depth":1,"path":"binaryclass/titanic_rf.md","ref":"binaryclass/titanic_rf.md","articles":[]},"previous":{"title":"Data
 
pareparation","level":"5.5.1","depth":2,"path":"binaryclass/webspam_dataset.md","ref":"binaryclass/webspam_dataset.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"header":1,"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"
 
splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,
 h2,h3,*:not(.callout) > 
h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall
 User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> 
Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User
 Manual for Apache 
Hivemall"},"file":{"path":"binaryclass/webspam_scw.md","mtime":"2016-12-02T08:02:42.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2017-04-27T13:49:22.144Z"},"basePath":"..","book":{"language":""}});
         });
     </script>
 </div>

http://git-wip-us.apache.org/repos/asf/incubator-hivemall-site/blob/ceeebcea/userguide/clustering/lda.html
----------------------------------------------------------------------
diff --git a/userguide/clustering/lda.html b/userguide/clustering/lda.html
index f312a4a..ad158a0 100644
--- a/userguide/clustering/lda.html
+++ b/userguide/clustering/lda.html
@@ -97,7 +97,7 @@
     <link rel="shortcut icon" href="../gitbook/images/favicon.ico" 
type="image/x-icon">
 
     
-    <link rel="next" href="../geospatial/latlon.html" />
+    <link rel="next" href="plsa.html" />
     
     
     <link rel="prev" href="../anomaly/changefinder.html" />
@@ -1667,6 +1667,21 @@
             
         </li>
     
+        <li class="chapter " data-level="10.2" data-path="plsa.html">
+            
+                <a href="plsa.html">
+            
+                    
+                        <b>10.2.</b>
+                    
+                    Probabilistic Latent Semantic Analysis
+            
+                </a>
+            
+
+            
+        </li>
+    
 
     
         
@@ -2320,7 +2335,7 @@ Apache Hivemall is an effort undergoing incubation at The 
Apache Software Founda
     <script>
         var gitbook = gitbook || [];
         gitbook.push(function() {
-            gitbook.page.hasChanged({"page":{"title":"Latent Dirichlet 
Allocation","level":"10.1","depth":1,"next":{"title":"Lat/Lon 
functions","level":"11.1","depth":1,"path":"geospatial/latlon.md","ref":"geospatial/latlon.md","articles":[]},"previous":{"title":"ChangeFinder:
 Detecting Outlier and Change-Point 
Simultaneously","level":"9.3","depth":1,"path":"anomaly/changefinder.md","ref":"anomaly/changefinder.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"header":1,"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apa
 
che/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},
 "anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > 
h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall
 User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> 
Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User
 Manual for Apache 
Hivemall"},"file":{"path":"clustering/lda.md","mtime":"2017-04-20T07:32:10.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2017-04-25T12:37:57.844Z"},"basePath":"..","book":{"language":""}});
+            gitbook.page.hasChanged({"page":{"title":"Latent Dirichlet 
Allocation","level":"10.1","depth":1,"next":{"title":"Probabilistic Latent 
Semantic 
Analysis","level":"10.2","depth":1,"path":"clustering/plsa.md","ref":"clustering/plsa.md","articles":[]},"previous":{"title":"ChangeFinder:
 Detecting Outlier and Change-Point 
Simultaneously","level":"9.3","depth":1,"path":"anomaly/changefinder.md","ref":"anomaly/changefinder.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"header":1,"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https
 
://github.com/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"
 showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > 
h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall
 User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> 
Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User
 Manual for Apache 
Hivemall"},"file":{"path":"clustering/lda.md","mtime":"2017-04-20T07:32:10.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2017-04-27T13:49:22.144Z"},"basePath":"..","book":{"language":""}});
         });
     </script>
 </div>

http://git-wip-us.apache.org/repos/asf/incubator-hivemall-site/blob/ceeebcea/userguide/clustering/plsa.html
----------------------------------------------------------------------
diff --git a/userguide/clustering/plsa.html b/userguide/clustering/plsa.html
new file mode 100644
index 0000000..1a680ca
--- /dev/null
+++ b/userguide/clustering/plsa.html
@@ -0,0 +1,2354 @@
+
+<!DOCTYPE HTML>
+<html lang="" >
+    <head>
+        <meta charset="UTF-8">
+        <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
+        <title>Probabilistic Latent Semantic Analysis · Hivemall User 
Manual</title>
+        <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+        <meta name="description" content="">
+        <meta name="generator" content="GitBook 3.2.2">
+        
+        
+        
+    
+    <link rel="stylesheet" href="../gitbook/style.css">
+
+    
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-splitter/splitter.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-etoc/plugin.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-callouts/plugin.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-toggle-chapters/toggle.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-codeblock-filename/block.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-expandable-chapters/expandable-chapters.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-multipart/multipart.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-katex/katex.min.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-emphasize/plugin.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-highlight/website.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-search/search.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-fontsettings/website.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-theme-api/theme-api.css">
+                
+            
+        
+
+    
+
+    
+        
+    
+        
+    
+        
+    
+        
+    
+        
+    
+        
+    
+
+        
+    
+    
+    <meta name="HandheldFriendly" content="true"/>
+    <meta name="viewport" content="width=device-width, initial-scale=1, 
user-scalable=no">
+    <meta name="apple-mobile-web-app-capable" content="yes">
+    <meta name="apple-mobile-web-app-status-bar-style" content="black">
+    <link rel="apple-touch-icon-precomposed" sizes="152x152" 
href="../gitbook/images/apple-touch-icon-precomposed-152.png">
+    <link rel="shortcut icon" href="../gitbook/images/favicon.ico" 
type="image/x-icon">
+
+    
+    <link rel="next" href="../geospatial/latlon.html" />
+    
+    
+    <link rel="prev" href="lda.html" />
+    
+
+    </head>
+    <body>
+        
+<div class="book">
+    <div class="book-summary">
+        
+            
+<div id="book-search-input" role="search">
+    <input type="text" placeholder="Type to search" />
+</div>
+
+            
+                <nav role="navigation">
+                
+
+
+<ul class="summary">
+    
+    
+    
+        
+        <li>
+            <a href="http://hivemall.incubator.apache.org/" target="_blank" 
class="custom-link"><i class="fa fa-home"></i> Home</a>
+        </li>
+    
+    
+
+    
+    <li class="divider"></li>
+    
+
+    
+        
+        <li class="header">TABLE OF CONTENTS</li>
+        
+        
+    
+        <li class="chapter " data-level="1.1" data-path="../">
+            
+                <a href="../">
+            
+                    
+                        <b>1.1.</b>
+                    
+                    Introduction
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.2" data-path="../getting_started/">
+            
+                <a href="../getting_started/">
+            
+                    
+                        <b>1.2.</b>
+                    
+                    Getting Started
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.2.1" 
data-path="../getting_started/installation.html">
+            
+                <a href="../getting_started/installation.html">
+            
+                    
+                        <b>1.2.1.</b>
+                    
+                    Installation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.2.2" 
data-path="../getting_started/permanent-functions.html">
+            
+                <a href="../getting_started/permanent-functions.html">
+            
+                    
+                        <b>1.2.2.</b>
+                    
+                    Install as permanent functions
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.2.3" 
data-path="../getting_started/input-format.html">
+            
+                <a href="../getting_started/input-format.html">
+            
+                    
+                        <b>1.2.3.</b>
+                    
+                    Input Format
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="1.3" data-path="../tips/">
+            
+                <a href="../tips/">
+            
+                    
+                        <b>1.3.</b>
+                    
+                    Tips for Effective Hivemall
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.3.1" 
data-path="../tips/addbias.html">
+            
+                <a href="../tips/addbias.html">
+            
+                    
+                        <b>1.3.1.</b>
+                    
+                    Explicit addBias() for better prediction
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.3.2" 
data-path="../tips/rand_amplify.html">
+            
+                <a href="../tips/rand_amplify.html">
+            
+                    
+                        <b>1.3.2.</b>
+                    
+                    Use rand_amplify() to better prediction results
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.3.3" 
data-path="../tips/rt_prediction.html">
+            
+                <a href="../tips/rt_prediction.html">
+            
+                    
+                        <b>1.3.3.</b>
+                    
+                    Real-time Prediction on RDBMS
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.3.4" 
data-path="../tips/ensemble_learning.html">
+            
+                <a href="../tips/ensemble_learning.html">
+            
+                    
+                        <b>1.3.4.</b>
+                    
+                    Ensemble learning for stable prediction
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.3.5" 
data-path="../tips/mixserver.html">
+            
+                <a href="../tips/mixserver.html">
+            
+                    
+                        <b>1.3.5.</b>
+                    
+                    Mixing models for a better prediction convergence (MIX 
server)
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.3.6" data-path="../tips/emr.html">
+            
+                <a href="../tips/emr.html">
+            
+                    
+                        <b>1.3.6.</b>
+                    
+                    Run Hivemall on Amazon Elastic MapReduce
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="1.4" 
data-path="../tips/general_tips.html">
+            
+                <a href="../tips/general_tips.html">
+            
+                    
+                        <b>1.4.</b>
+                    
+                    General Hive/Hadoop tips
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.4.1" data-path="../tips/rowid.html">
+            
+                <a href="../tips/rowid.html">
+            
+                    
+                        <b>1.4.1.</b>
+                    
+                    Adding rowid for each row
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.4.2" 
data-path="../tips/hadoop_tuning.html">
+            
+                <a href="../tips/hadoop_tuning.html">
+            
+                    
+                        <b>1.4.2.</b>
+                    
+                    Hadoop tuning for Hivemall
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="1.5" data-path="../troubleshooting/">
+            
+                <a href="../troubleshooting/">
+            
+                    
+                        <b>1.5.</b>
+                    
+                    Troubleshooting
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.5.1" 
data-path="../troubleshooting/oom.html">
+            
+                <a href="../troubleshooting/oom.html">
+            
+                    
+                        <b>1.5.1.</b>
+                    
+                    OutOfMemoryError in training
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.5.2" 
data-path="../troubleshooting/mapjoin_task_error.html">
+            
+                <a href="../troubleshooting/mapjoin_task_error.html">
+            
+                    
+                        <b>1.5.2.</b>
+                    
+                    SemanticException Generate Map Join Task Error: Cannot 
serialize object
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.5.3" 
data-path="../troubleshooting/asterisk.html">
+            
+                <a href="../troubleshooting/asterisk.html">
+            
+                    
+                        <b>1.5.3.</b>
+                    
+                    Asterisk argument for UDTF does not work
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.5.4" 
data-path="../troubleshooting/num_mappers.html">
+            
+                <a href="../troubleshooting/num_mappers.html">
+            
+                    
+                        <b>1.5.4.</b>
+                    
+                    The number of mappers is less than input splits in Hadoop 
2.x
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.5.5" 
data-path="../troubleshooting/mapjoin_classcastex.html">
+            
+                <a href="../troubleshooting/mapjoin_classcastex.html">
+            
+                    
+                        <b>1.5.5.</b>
+                    
+                    Map-side Join causes ClassCastException on Tez
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part II - Generic Features</li>
+        
+        
+    
+        <li class="chapter " data-level="2.1" 
data-path="../misc/generic_funcs.html">
+            
+                <a href="../misc/generic_funcs.html">
+            
+                    
+                        <b>2.1.</b>
+                    
+                    List of generic Hivemall functions
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="2.2" data-path="../misc/topk.html">
+            
+                <a href="../misc/topk.html">
+            
+                    
+                        <b>2.2.</b>
+                    
+                    Efficient Top-K query processing
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="2.3" 
data-path="../misc/tokenizer.html">
+            
+                <a href="../misc/tokenizer.html">
+            
+                    
+                        <b>2.3.</b>
+                    
+                    English/Japanese Text Tokenizer
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part III - Feature Engineering</li>
+        
+        
+    
+        <li class="chapter " data-level="3.1" 
data-path="../ft_engineering/scaling.html">
+            
+                <a href="../ft_engineering/scaling.html">
+            
+                    
+                        <b>3.1.</b>
+                    
+                    Feature Scaling
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="3.2" 
data-path="../ft_engineering/hashing.html">
+            
+                <a href="../ft_engineering/hashing.html">
+            
+                    
+                        <b>3.2.</b>
+                    
+                    Feature Hashing
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="3.3" 
data-path="../ft_engineering/tfidf.html">
+            
+                <a href="../ft_engineering/tfidf.html">
+            
+                    
+                        <b>3.3.</b>
+                    
+                    TF-IDF calculation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="3.4" 
data-path="../ft_engineering/ft_trans.html">
+            
+                <a href="../ft_engineering/ft_trans.html">
+            
+                    
+                        <b>3.4.</b>
+                    
+                    FEATURE TRANSFORMATION
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="3.4.1" 
data-path="../ft_engineering/vectorizer.html">
+            
+                <a href="../ft_engineering/vectorizer.html">
+            
+                    
+                        <b>3.4.1.</b>
+                    
+                    Vectorize Features
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="3.4.2" 
data-path="../ft_engineering/quantify.html">
+            
+                <a href="../ft_engineering/quantify.html">
+            
+                    
+                        <b>3.4.2.</b>
+                    
+                    Quantify non-number features
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="3.5" 
data-path="../ft_engineering/feature_selection.html">
+            
+                <a href="../ft_engineering/feature_selection.html">
+            
+                    
+                        <b>3.5.</b>
+                    
+                    Feature selection
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part IV - Evaluation</li>
+        
+        
+    
+        <li class="chapter " data-level="4.1" 
data-path="../eval/stat_eval.html">
+            
+                <a href="../eval/stat_eval.html">
+            
+                    
+                        <b>4.1.</b>
+                    
+                    Statistical evaluation of a prediction model
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="4.1.1" data-path="../eval/auc.html">
+            
+                <a href="../eval/auc.html">
+            
+                    
+                        <b>4.1.1.</b>
+                    
+                    Area Under the ROC Curve
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="4.2" data-path="../eval/rank.html">
+            
+                <a href="../eval/rank.html">
+            
+                    
+                        <b>4.2.</b>
+                    
+                    Ranking Measures
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="4.3" data-path="../eval/datagen.html">
+            
+                <a href="../eval/datagen.html">
+            
+                    
+                        <b>4.3.</b>
+                    
+                    Data Generation
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="4.3.1" 
data-path="../eval/lr_datagen.html">
+            
+                <a href="../eval/lr_datagen.html">
+            
+                    
+                        <b>4.3.1.</b>
+                    
+                    Logistic Regression data generation
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part V - Binary classification</li>
+        
+        
+    
+        <li class="chapter " data-level="5.1" 
data-path="../binaryclass/a9a.html">
+            
+                <a href="../binaryclass/a9a.html">
+            
+                    
+                        <b>5.1.</b>
+                    
+                    a9a Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="5.1.1" 
data-path="../binaryclass/a9a_dataset.html">
+            
+                <a href="../binaryclass/a9a_dataset.html">
+            
+                    
+                        <b>5.1.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="5.1.2" 
data-path="../binaryclass/a9a_lr.html">
+            
+                <a href="../binaryclass/a9a_lr.html">
+            
+                    
+                        <b>5.1.2.</b>
+                    
+                    Logistic Regression
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="5.1.3" 
data-path="../binaryclass/a9a_minibatch.html">
+            
+                <a href="../binaryclass/a9a_minibatch.html">
+            
+                    
+                        <b>5.1.3.</b>
+                    
+                    Mini-batch Gradient Descent
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="5.2" 
data-path="../binaryclass/news20.html">
+            
+                <a href="../binaryclass/news20.html">
+            
+                    
+                        <b>5.2.</b>
+                    
+                    News20 Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="5.2.1" 
data-path="../binaryclass/news20_dataset.html">
+            
+                <a href="../binaryclass/news20_dataset.html">
+            
+                    
+                        <b>5.2.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="5.2.2" 
data-path="../binaryclass/news20_pa.html">
+            
+                <a href="../binaryclass/news20_pa.html">
+            
+                    
+                        <b>5.2.2.</b>
+                    
+                    Perceptron, Passive Aggressive
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="5.2.3" 
data-path="../binaryclass/news20_scw.html">
+            
+                <a href="../binaryclass/news20_scw.html">
+            
+                    
+                        <b>5.2.3.</b>
+                    
+                    CW, AROW, SCW
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="5.2.4" 
data-path="../binaryclass/news20_adagrad.html">
+            
+                <a href="../binaryclass/news20_adagrad.html">
+            
+                    
+                        <b>5.2.4.</b>
+                    
+                    AdaGradRDA, AdaGrad, AdaDelta
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="5.3" 
data-path="../binaryclass/kdd2010a.html">
+            
+                <a href="../binaryclass/kdd2010a.html">
+            
+                    
+                        <b>5.3.</b>
+                    
+                    KDD2010a Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="5.3.1" 
data-path="../binaryclass/kdd2010a_dataset.html">
+            
+                <a href="../binaryclass/kdd2010a_dataset.html">
+            
+                    
+                        <b>5.3.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="5.3.2" 
data-path="../binaryclass/kdd2010a_scw.html">
+            
+                <a href="../binaryclass/kdd2010a_scw.html">
+            
+                    
+                        <b>5.3.2.</b>
+                    
+                    PA, CW, AROW, SCW
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="5.4" 
data-path="../binaryclass/kdd2010b.html">
+            
+                <a href="../binaryclass/kdd2010b.html">
+            
+                    
+                        <b>5.4.</b>
+                    
+                    KDD2010b Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="5.4.1" 
data-path="../binaryclass/kdd2010b_dataset.html">
+            
+                <a href="../binaryclass/kdd2010b_dataset.html">
+            
+                    
+                        <b>5.4.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="5.4.2" 
data-path="../binaryclass/kdd2010b_arow.html">
+            
+                <a href="../binaryclass/kdd2010b_arow.html">
+            
+                    
+                        <b>5.4.2.</b>
+                    
+                    AROW
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="5.5" 
data-path="../binaryclass/webspam.html">
+            
+                <a href="../binaryclass/webspam.html">
+            
+                    
+                        <b>5.5.</b>
+                    
+                    Webspam Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="5.5.1" 
data-path="../binaryclass/webspam_dataset.html">
+            
+                <a href="../binaryclass/webspam_dataset.html">
+            
+                    
+                        <b>5.5.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="5.5.2" 
data-path="../binaryclass/webspam_scw.html">
+            
+                <a href="../binaryclass/webspam_scw.html">
+            
+                    
+                        <b>5.5.2.</b>
+                    
+                    PA1, AROW, SCW
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="5.6" 
data-path="../binaryclass/titanic_rf.html">
+            
+                <a href="../binaryclass/titanic_rf.html">
+            
+                    
+                        <b>5.6.</b>
+                    
+                    Kaggle Titanic Tutorial
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part VI - Multiclass classification</li>
+        
+        
+    
+        <li class="chapter " data-level="6.1" 
data-path="../multiclass/news20.html">
+            
+                <a href="../multiclass/news20.html">
+            
+                    
+                        <b>6.1.</b>
+                    
+                    News20 Multiclass Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="6.1.1" 
data-path="../multiclass/news20_dataset.html">
+            
+                <a href="../multiclass/news20_dataset.html">
+            
+                    
+                        <b>6.1.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.1.2" 
data-path="../multiclass/news20_one-vs-the-rest_dataset.html">
+            
+                <a href="../multiclass/news20_one-vs-the-rest_dataset.html">
+            
+                    
+                        <b>6.1.2.</b>
+                    
+                    Data preparation for one-vs-the-rest classifiers
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.1.3" 
data-path="../multiclass/news20_pa.html">
+            
+                <a href="../multiclass/news20_pa.html">
+            
+                    
+                        <b>6.1.3.</b>
+                    
+                    PA
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.1.4" 
data-path="../multiclass/news20_scw.html">
+            
+                <a href="../multiclass/news20_scw.html">
+            
+                    
+                        <b>6.1.4.</b>
+                    
+                    CW, AROW, SCW
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.1.5" 
data-path="../multiclass/news20_ensemble.html">
+            
+                <a href="../multiclass/news20_ensemble.html">
+            
+                    
+                        <b>6.1.5.</b>
+                    
+                    Ensemble learning
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.1.6" 
data-path="../multiclass/news20_one-vs-the-rest.html">
+            
+                <a href="../multiclass/news20_one-vs-the-rest.html">
+            
+                    
+                        <b>6.1.6.</b>
+                    
+                    one-vs-the-rest classifier
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="6.2" 
data-path="../multiclass/iris.html">
+            
+                <a href="../multiclass/iris.html">
+            
+                    
+                        <b>6.2.</b>
+                    
+                    Iris Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="6.2.1" 
data-path="../multiclass/iris_dataset.html">
+            
+                <a href="../multiclass/iris_dataset.html">
+            
+                    
+                        <b>6.2.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.2.2" 
data-path="../multiclass/iris_scw.html">
+            
+                <a href="../multiclass/iris_scw.html">
+            
+                    
+                        <b>6.2.2.</b>
+                    
+                    SCW
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.2.3" 
data-path="../multiclass/iris_randomforest.html">
+            
+                <a href="../multiclass/iris_randomforest.html">
+            
+                    
+                        <b>6.2.3.</b>
+                    
+                    RandomForest
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part VII - Regression</li>
+        
+        
+    
+        <li class="chapter " data-level="7.1" 
data-path="../regression/e2006.html">
+            
+                <a href="../regression/e2006.html">
+            
+                    
+                        <b>7.1.</b>
+                    
+                    E2006-tfidf regression Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="7.1.1" 
data-path="../regression/e2006_dataset.html">
+            
+                <a href="../regression/e2006_dataset.html">
+            
+                    
+                        <b>7.1.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="7.1.2" 
data-path="../regression/e2006_arow.html">
+            
+                <a href="../regression/e2006_arow.html">
+            
+                    
+                        <b>7.1.2.</b>
+                    
+                    Passive Aggressive, AROW
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="7.2" 
data-path="../regression/kddcup12tr2.html">
+            
+                <a href="../regression/kddcup12tr2.html">
+            
+                    
+                        <b>7.2.</b>
+                    
+                    KDDCup 2012 track 2 CTR prediction Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="7.2.1" 
data-path="../regression/kddcup12tr2_dataset.html">
+            
+                <a href="../regression/kddcup12tr2_dataset.html">
+            
+                    
+                        <b>7.2.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="7.2.2" 
data-path="../regression/kddcup12tr2_lr.html">
+            
+                <a href="../regression/kddcup12tr2_lr.html">
+            
+                    
+                        <b>7.2.2.</b>
+                    
+                    Logistic Regression, Passive Aggressive
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="7.2.3" 
data-path="../regression/kddcup12tr2_lr_amplify.html">
+            
+                <a href="../regression/kddcup12tr2_lr_amplify.html">
+            
+                    
+                        <b>7.2.3.</b>
+                    
+                    Logistic Regression with Amplifier
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="7.2.4" 
data-path="../regression/kddcup12tr2_adagrad.html">
+            
+                <a href="../regression/kddcup12tr2_adagrad.html">
+            
+                    
+                        <b>7.2.4.</b>
+                    
+                    AdaGrad, AdaDelta
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part VIII - Recommendation</li>
+        
+        
+    
+        <li class="chapter " data-level="8.1" data-path="../recommend/cf.html">
+            
+                <a href="../recommend/cf.html">
+            
+                    
+                        <b>8.1.</b>
+                    
+                    Collaborative Filtering
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="8.1.1" 
data-path="../recommend/item_based_cf.html">
+            
+                <a href="../recommend/item_based_cf.html">
+            
+                    
+                        <b>8.1.1.</b>
+                    
+                    Item-based Collaborative Filtering
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="8.2" 
data-path="../recommend/news20.html">
+            
+                <a href="../recommend/news20.html">
+            
+                    
+                        <b>8.2.</b>
+                    
+                    News20 related article recommendation Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="8.2.1" 
data-path="../multiclass/news20_dataset.html">
+            
+                <a href="../multiclass/news20_dataset.html">
+            
+                    
+                        <b>8.2.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="8.2.2" 
data-path="../recommend/news20_jaccard.html">
+            
+                <a href="../recommend/news20_jaccard.html">
+            
+                    
+                        <b>8.2.2.</b>
+                    
+                    LSH/Minhash and Jaccard Similarity
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="8.2.3" 
data-path="../recommend/news20_knn.html">
+            
+                <a href="../recommend/news20_knn.html">
+            
+                    
+                        <b>8.2.3.</b>
+                    
+                    LSH/Minhash and Brute-Force Search
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="8.2.4" 
data-path="../recommend/news20_bbit_minhash.html">
+            
+                <a href="../recommend/news20_bbit_minhash.html">
+            
+                    
+                        <b>8.2.4.</b>
+                    
+                    kNN search using b-Bits Minhash
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="8.3" 
data-path="../recommend/movielens.html">
+            
+                <a href="../recommend/movielens.html">
+            
+                    
+                        <b>8.3.</b>
+                    
+                    MovieLens movie recommendation Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="8.3.1" 
data-path="../recommend/movielens_dataset.html">
+            
+                <a href="../recommend/movielens_dataset.html">
+            
+                    
+                        <b>8.3.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="8.3.2" 
data-path="../recommend/movielens_mf.html">
+            
+                <a href="../recommend/movielens_mf.html">
+            
+                    
+                        <b>8.3.2.</b>
+                    
+                    Matrix Factorization
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="8.3.3" 
data-path="../recommend/movielens_fm.html">
+            
+                <a href="../recommend/movielens_fm.html">
+            
+                    
+                        <b>8.3.3.</b>
+                    
+                    Factorization Machine
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="8.3.4" 
data-path="../recommend/movielens_cv.html">
+            
+                <a href="../recommend/movielens_cv.html">
+            
+                    
+                        <b>8.3.4.</b>
+                    
+                    10-fold Cross Validation (Matrix Factorization)
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part IX - Anomaly Detection</li>
+        
+        
+    
+        <li class="chapter " data-level="9.1" data-path="../anomaly/lof.html">
+            
+                <a href="../anomaly/lof.html">
+            
+                    
+                        <b>9.1.</b>
+                    
+                    Outlier Detection using Local Outlier Factor (LOF)
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="9.2" data-path="../anomaly/sst.html">
+            
+                <a href="../anomaly/sst.html">
+            
+                    
+                        <b>9.2.</b>
+                    
+                    Change-Point Detection using Singular Spectrum 
Transformation (SST)
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="9.3" 
data-path="../anomaly/changefinder.html">
+            
+                <a href="../anomaly/changefinder.html">
+            
+                    
+                        <b>9.3.</b>
+                    
+                    ChangeFinder: Detecting Outlier and Change-Point 
Simultaneously
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part X - Clustering</li>
+        
+        
+    
+        <li class="chapter " data-level="10.1" data-path="lda.html">
+            
+                <a href="lda.html">
+            
+                    
+                        <b>10.1.</b>
+                    
+                    Latent Dirichlet Allocation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter active" data-level="10.2" data-path="plsa.html">
+            
+                <a href="plsa.html">
+            
+                    
+                        <b>10.2.</b>
+                    
+                    Probabilistic Latent Semantic Analysis
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part XI - GeoSpatial functions</li>
+        
+        
+    
+        <li class="chapter " data-level="11.1" 
data-path="../geospatial/latlon.html">
+            
+                <a href="../geospatial/latlon.html">
+            
+                    
+                        <b>11.1.</b>
+                    
+                    Lat/Lon functions
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part XII - Hivemall on Spark</li>
+        
+        
+    
+        <li class="chapter " data-level="12.1" 
data-path="../spark/getting_started/">
+            
+                <a href="../spark/getting_started/">
+            
+                    
+                        <b>12.1.</b>
+                    
+                    Getting Started
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="12.1.1" 
data-path="../spark/getting_started/installation.html">
+            
+                <a href="../spark/getting_started/installation.html">
+            
+                    
+                        <b>12.1.1.</b>
+                    
+                    Installation
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="12.2" 
data-path="../spark/binaryclass/">
+            
+                <a href="../spark/binaryclass/">
+            
+                    
+                        <b>12.2.</b>
+                    
+                    Binary Classification
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="12.2.1" 
data-path="../spark/binaryclass/a9a_df.html">
+            
+                <a href="../spark/binaryclass/a9a_df.html">
+            
+                    
+                        <b>12.2.1.</b>
+                    
+                    a9a Tutorial for DataFrame
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="12.3" 
data-path="../spark/binaryclass/">
+            
+                <a href="../spark/binaryclass/">
+            
+                    
+                        <b>12.3.</b>
+                    
+                    Regression
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="12.3.1" 
data-path="../spark/regression/e2006_df.html">
+            
+                <a href="../spark/regression/e2006_df.html">
+            
+                    
+                        <b>12.3.1.</b>
+                    
+                    E2006-tfidf regression Tutorial for DataFrame
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="12.4" 
data-path="../spark/misc/misc.html">
+            
+                <a href="../spark/misc/misc.html">
+            
+                    
+                        <b>12.4.</b>
+                    
+                    Generic features
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="12.4.1" 
data-path="../spark/misc/topk_join.html">
+            
+                <a href="../spark/misc/topk_join.html">
+            
+                    
+                        <b>12.4.1.</b>
+                    
+                    Top-k Join processing
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="12.4.2" 
data-path="../spark/misc/functions.html">
+            
+                <a href="../spark/misc/functions.html">
+            
+                    
+                        <b>12.4.2.</b>
+                    
+                    Other utility functions
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part XIII - Hivemall on Docker</li>
+        
+        
+    
+        <li class="chapter " data-level="13.1" 
data-path="../docker/getting_started.html">
+            
+                <a href="../docker/getting_started.html">
+            
+                    
+                        <b>13.1.</b>
+                    
+                    Getting Started
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part XIV - External References</li>
+        
+        
+    
+        <li class="chapter " data-level="14.1" >
+            
+                <a target="_blank" 
href="https://github.com/maropu/hivemall-spark">
+            
+                    
+                        <b>14.1.</b>
+                    
+                    Hivemall on Apache Spark
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="14.2" >
+            
+                <a target="_blank" 
href="https://github.com/daijyc/hivemall/wiki/PigHome">
+            
+                    
+                        <b>14.2.</b>
+                    
+                    Hivemall on Apache Pig
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+    
+
+    <li class="divider"></li>
+
+    <li>
+        <a href="https://www.gitbook.com" target="blank" class="gitbook-link">
+            Published with GitBook
+        </a>
+    </li>
+</ul>
+
+
+                </nav>
+            
+        
+    </div>
+
+    <div class="book-body">
+        
+            <div class="body-inner">
+                
+                    
+
+<div class="book-header" role="navigation">
+    
+
+    <!-- Title -->
+    <h1>
+        <i class="fa fa-circle-o-notch fa-spin"></i>
+        <a href=".." >Probabilistic Latent Semantic Analysis</a>
+    </h1>
+</div>
+
+
+
+
+                    <div class="page-wrapper" tabindex="-1" role="main">
+                        <div class="page-inner">
+                            
+<div id="book-search-results">
+    <div class="search-noresults">
+    
+                                <section class="normal markdown-section">
+                                
+                                <!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<p>As described in <a href="lda.html">our user guide for Latent Dirichlet 
Allocation (LDA)</a>, Hivemall enables you to apply clustering to your data 
based on a topic modeling technique. While LDA is one of the most popular 
techniques, there is another approach named <strong>Probabilistic Latent 
Semantic Analysis</strong> (pLSA). In fact, pLSA is the predecessor of LDA, but 
it has an advantage in terms of running time.</p>
+<ul>
+<li>T. Hofmann. <a href="http://dl.acm.org/citation.cfm?id=312649" 
target="_blank">Probabilistic Latent Semantic Indexing</a>. SIGIR 1999, pp. 
50-57.</li>
+<li>T. Hofmann. <a 
href="http://www.iro.umontreal.ca/~nie/IFT6255/Hofmann-UAI99.pdf" 
target="_blank">Probabilistic Latent Semantic Analysis</a>. UAI 1999, pp. 
289-296.</li>
+</ul>
+<p>In order to efficiently handle large-scale data, our pLSA implementation is 
based on the following incremental variant of the original pLSA algorithm:</p>
+<ul>
+<li>H. Wu, et al. <a href="http://dl.acm.org/citation.cfm?id=1454026" 
target="_blank">Incremental Probabilistic Latent Semantic Analysis for 
Automatic Question Recommendation</a>. RecSys 2008, pp. 99-106.</li>
+</ul>
+<!-- toc --><div id="toc" class="toc">
+
+<ul>
+<li><a href="#usage">Usage</a></li>
+<li><a href="#difference-with-lda">Difference with LDA</a></li>
+<li><a href="#setting-hyper-parameter-alpha">Setting hyper-parameter 
<code>alpha</code></a></li>
+</ul>
+
+</div><!-- tocstop -->
+<div class="panel panel-primary"><div class="panel-heading"><h3 
class="panel-title" id="note"><i class="fa fa-edit"></i> Note</h3></div><div 
class="panel-body"><p>This feature is supported in Hivemall v0.5-rc.1 or 
later.</p></div></div>
+<h1 id="usage">Usage</h1>
+<p>Basically, you can use our pLSA function in a similar way to LDA.</p>
+<p>In particular, we have two pLSA functions, <code>train_plsa()</code> and 
<code>plsa_predict()</code>. These functions can be used almost interchangeably 
with <code>train_lda()</code> and <code>lda_predict()</code>. Thus, reading <a 
href="lda.html">our user guide for LDA</a> should be helpful before trying 
pLSA.</p>
+<p>In short, for the sample <code>docs</code> table we introduced in the LDA 
tutorial:</p>
+<table>
+<thead>
+<tr>
+<th style="text-align:center">docid</th>
+<th style="text-align:left">doc</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td style="text-align:center">1</td>
+<td style="text-align:left">&quot;Fruits and vegetables are healthy.&quot;</td>
+</tr>
+<tr>
+<td style="text-align:center">2</td>
+<td style="text-align:left">&quot;I like apples, oranges, and avocados. I do 
not like the flu or colds.&quot;</td>
+</tr>
+<tr>
+<td style="text-align:center">...</td>
+<td style="text-align:left">...</td>
+</tr>
+</tbody>
+</table>
+<p>a pLSA model can be built as follows:</p>
+<pre><code class="lang-sql">with word_counts as (
+  <span class="hljs-keyword">select</span>
+    docid,
+    feature(word, <span class="hljs-keyword">count</span>(word)) <span 
class="hljs-keyword">as</span> f
+  <span class="hljs-keyword">from</span> docs t1 lateral <span 
class="hljs-keyword">view</span> explode(tokenize(doc, <span 
class="hljs-literal">true</span>)) t2 <span class="hljs-keyword">as</span> word
+  <span class="hljs-keyword">where</span>
+    <span class="hljs-keyword">not</span> is_stopword(word)
+  <span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span>
+    docid, word
+)
+<span class="hljs-keyword">select</span>
+    train_plsa(feature, <span class="hljs-string">&quot;-topics 2 -eps 0.00001 
-iter 2048 -alpha 0.01&quot;</span>) <span class="hljs-keyword">as</span> 
(label, word, prob)
+<span class="hljs-keyword">from</span> (
+  <span class="hljs-keyword">select</span> docid, collect_set(f) <span 
class="hljs-keyword">as</span> feature
+  <span class="hljs-keyword">from</span> word_counts
+  <span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span> 
docid
+) t
+;
+</code></pre>
+<table>
+<thead>
+<tr>
+<th style="text-align:center">label</th>
+<th style="text-align:center">word</th>
+<th style="text-align:center">prob</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td style="text-align:center">0</td>
+<td style="text-align:center">like</td>
+<td style="text-align:center">0.28549945</td>
+</tr>
+<tr>
+<td style="text-align:center">0</td>
+<td style="text-align:center">colds</td>
+<td style="text-align:center">0.14294468</td>
+</tr>
+<tr>
+<td style="text-align:center">0</td>
+<td style="text-align:center">apples</td>
+<td style="text-align:center">0.14291435</td>
+</tr>
+<tr>
+<td style="text-align:center">0</td>
+<td style="text-align:center">avocados</td>
+<td style="text-align:center">0.1428958</td>
+</tr>
+<tr>
+<td style="text-align:center">0</td>
+<td style="text-align:center">flu</td>
+<td style="text-align:center">0.14287639</td>
+</tr>
+<tr>
+<td style="text-align:center">0</td>
+<td style="text-align:center">oranges</td>
+<td style="text-align:center">0.1428691</td>
+</tr>
+<tr>
+<td style="text-align:center">0</td>
+<td style="text-align:center">healthy</td>
+<td style="text-align:center">1.2605103E-7</td>
+</tr>
+<tr>
+<td style="text-align:center">0</td>
+<td style="text-align:center">fruits</td>
+<td style="text-align:center">4.772253E-8</td>
+</tr>
+<tr>
+<td style="text-align:center">0</td>
+<td style="text-align:center">vegetables</td>
+<td style="text-align:center">1.929087E-8</td>
+</tr>
+<tr>
+<td style="text-align:center">1</td>
+<td style="text-align:center">vegetables</td>
+<td style="text-align:center">0.32713377</td>
+</tr>
+<tr>
+<td style="text-align:center">1</td>
+<td style="text-align:center">fruits</td>
+<td style="text-align:center">0.32713372</td>
+</tr>
+<tr>
+<td style="text-align:center">1</td>
+<td style="text-align:center">healthy</td>
+<td style="text-align:center">0.3271335</td>
+</tr>
+<tr>
+<td style="text-align:center">1</td>
+<td style="text-align:center">like</td>
+<td style="text-align:center">0.006977764</td>
+</tr>
+<tr>
+<td style="text-align:center">1</td>
+<td style="text-align:center">oranges</td>
+<td style="text-align:center">0.0025642214</td>
+</tr>
+<tr>
+<td style="text-align:center">1</td>
+<td style="text-align:center">flu</td>
+<td style="text-align:center">0.002507711</td>
+</tr>
+<tr>
+<td style="text-align:center">1</td>
+<td style="text-align:center">avocados</td>
+<td style="text-align:center">0.0023572792</td>
+</tr>
+<tr>
+<td style="text-align:center">1</td>
+<td style="text-align:center">apples</td>
+<td style="text-align:center">0.002213457</td>
+</tr>
+<tr>
+<td style="text-align:center">1</td>
+<td style="text-align:center">colds</td>
+<td style="text-align:center">0.001978546</td>
+</tr>
+</tbody>
+</table>
+<p>And prediction can be done as:</p>
+<pre><code class="lang-sql">test as (
+  <span class="hljs-keyword">select</span>
+    docid,
+    word,
+    <span class="hljs-keyword">count</span>(word) <span 
class="hljs-keyword">as</span> <span class="hljs-keyword">value</span>
+  <span class="hljs-keyword">from</span> docs t1 LATERAL <span 
class="hljs-keyword">VIEW</span> explode(tokenize(doc, <span 
class="hljs-literal">true</span>)) t2 <span class="hljs-keyword">as</span> word
+  <span class="hljs-keyword">where</span>
+    <span class="hljs-keyword">not</span> is_stopword(word)
+  <span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span>
+    docid, word
+),
+topic <span class="hljs-keyword">as</span> (
+  <span class="hljs-keyword">select</span>
+    t.docid,
+    plsa_predict(t.word, t.<span class="hljs-keyword">value</span>, m.label, 
m.prob, <span class="hljs-string">&quot;-topics 2&quot;</span>) <span 
class="hljs-keyword">as</span> probabilities
+  <span class="hljs-keyword">from</span>
+    <span class="hljs-keyword">test</span> t
+    <span class="hljs-keyword">JOIN</span> plsa_model m <span 
class="hljs-keyword">ON</span> (t.word = m.word)
+  <span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span>
+    t.docid
+)
+<span class="hljs-keyword">select</span> docid, probabilities, 
probabilities[<span class="hljs-number">0</span>].label, m.words <span 
class="hljs-comment">-- topic each document should be assigned</span>
+<span class="hljs-keyword">from</span> topic t
+<span class="hljs-keyword">join</span> (
+  <span class="hljs-keyword">select</span> label, collect_set(feature(word, 
prob)) <span class="hljs-keyword">as</span> words
+  <span class="hljs-keyword">from</span> plsa_model
+  <span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span> 
label
+) m <span class="hljs-keyword">on</span> t.probabilities[<span 
class="hljs-number">0</span>].label = m.label
+;
+</code></pre>
+<table>
+<thead>
+<tr>
+<th style="text-align:center">docid</th>
+<th style="text-align:left">probabilities</th>
+<th style="text-align:center">label</th>
+<th style="text-align:left">m.words</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td style="text-align:center">1</td>
+<td 
style="text-align:left">[{&quot;label&quot;:1,&quot;probability&quot;:0.72298235},{&quot;label&quot;:0,&quot;probability&quot;:0.27701768}]</td>
+<td style="text-align:center">1</td>
+<td 
style="text-align:left">[&quot;vegetables:0.32713377&quot;,&quot;fruits:0.32713372&quot;,&quot;healthy:0.3271335&quot;,&quot;like:0.006977764&quot;,&quot;oranges:0.0025642214&quot;,&quot;flu:0.002507711&quot;,&quot;avocados:0.0023572792&quot;,&quot;apples:0.002213457&quot;,&quot;colds:0.001978546&quot;]</td>
+</tr>
+<tr>
+<td style="text-align:center">2</td>
+<td 
style="text-align:left">[{&quot;label&quot;:0,&quot;probability&quot;:0.7052526},{&quot;label&quot;:1,&quot;probability&quot;:0.2947474}]</td>
+<td style="text-align:center">0</td>
+<td 
style="text-align:left">[&quot;like:0.28549945&quot;,&quot;colds:0.14294468&quot;,&quot;apples:0.14291435&quot;,&quot;avocados:0.1428958&quot;,&quot;flu:0.14287639&quot;,&quot;oranges:0.1428691&quot;,&quot;healthy:1.2605103E-7&quot;,&quot;fruits:4.772253E-8&quot;,&quot;vegetables:1.929087E-8&quot;]</td>
+</tr>
+</tbody>
+</table>
+<h1 id="difference-with-lda">Difference with LDA</h1>
+<p>The main advantage of using pLSA is its efficiency. Since its mathematical 
formulation and optimization logic are much simpler than those of LDA, using 
pLSA generally requires a much shorter running time.</p>
+<p>In terms of accuracy, LDA could be better than pLSA. For example, the word 
<code>like</code>, which appears twice in the above sample document#2, gets 
larger probabilities in both topic#1 and #2, even though one document does not 
contain the word. By contrast, LDA results (i.e., <em>lambda</em> values) are 
more clearly separated as shown in <a href="lda.html">the LDA page</a>. Thus, a 
pLSA model is likely to be biased.</p>
+<p>For the reasons mentioned above, we recommend that you first use LDA. 
After that, if you encounter problems such as slow running time and 
undesirable clustering results, try the alternative pLSA approach.</p>
+<h1 id="setting-hyper-parameter-alpha">Setting hyper-parameter 
<code>alpha</code></h1>
+<p>For training pLSA, we set a hyper-parameter <code>alpha</code> in the above 
example:</p>
+<pre><code class="lang-sql"><span class="hljs-keyword">SELECT</span> 
train_plsa(feature, <span class="hljs-string">&quot;-topics 2 -eps 0.00001 
-iter 2048 -alpha 0.01&quot;</span>)
+</code></pre>
+<p>This value controls <strong>how much each iterative model update is affected 
by the old results</strong>.</p>
+<p>From an algorithmic point of view, training pLSA (and LDA) iteratively 
repeats certain operations and updates the target value (i.e., probability 
obtained as a result of <code>train_plsa()</code>). This iterative procedure 
gradually makes the probabilities more accurate. What <code>alpha</code> does 
is to control the degree of the change of probabilities in each step.</p>
+<p>Normally, <code>alpha</code> is set to a small value from 0.0 to 0.5 
(default is 0.5).</p>
+<div id="page-footer"><hr><!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at

+    http://www.apache.org/licenses/LICENSE-2.0

+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<p><sub><font color="gray">
+Apache Hivemall is an effort undergoing incubation at The Apache Software 
Foundation (ASF), sponsored by the Apache Incubator.
+</font></sub></p>
+</div>
+
+                                
+                                </section>
+                            
+    </div>
+    <div class="search-results">
+        <div class="has-results">
+            
+            <h1 class="search-results-title"><span 
class='search-results-count'></span> results matching "<span 
class='search-query'></span>"</h1>
+            <ul class="search-results-list"></ul>
+            
+        </div>
+        <div class="no-results">
+            
+            <h1 class="search-results-title">No results matching "<span 
class='search-query'></span>"</h1>
+            
+        </div>
+    </div>
+</div>
+
+                        </div>
+                    </div>
+                
+            </div>
+
+            
+
+        
+    </div>
+
+    <script>
+        var gitbook = gitbook || [];
+        gitbook.push(function() {
+            gitbook.page.hasChanged({"page":{"title":"Probabilistic Latent 
Semantic Analysis","level":"10.2","depth":1,"next":{"title":"Lat/Lon 
functions","level":"11.1","depth":1,"path":"geospatial/latlon.md","ref":"geospatial/latlon.md","articles":[]},"previous":{"title":"Latent
 Dirichlet 
Allocation","level":"10.1","depth":1,"path":"clustering/lda.md","ref":"clustering/lda.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","sitemap","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"header":1,"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"splitter";
 
:{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"sitemap":{"hostname":"http://hivemall.incubator.apache.org/"},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:n
 ot(.callout) > 
h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall
 User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> 
Home":"http://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User
 Manual for Apache 
Hivemall"},"file":{"path":"clustering/plsa.md","mtime":"2017-04-27T13:43:36.000Z","type":"markdown"},"gitbook":{"version":"3.2.2","time":"2017-04-27T13:49:22.144Z"},"basePath":"..","book":{"language":""}});
+        });
+    </script>
+</div>
+
+        
+    <script src="../gitbook/gitbook.js"></script>
+    <script src="../gitbook/theme.js"></script>
+    
+        
+        <script src="../gitbook/gitbook-plugin-edit-link/plugin.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-github/plugin.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-splitter/splitter.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-etoc/plugin.js"></script>
+        
+    
+        
+        <script 
src="../gitbook/gitbook-plugin-toggle-chapters/toggle.js"></script>
+        
+    
+        
+        <script 
src="https://cdnjs.cloudflare.com/ajax/libs/anchor-js/3.1.1/anchor.min.js"></script>
+        
+    
+        
+        <script 
src="../gitbook/gitbook-plugin-anchorjs/anchor-style.js"></script>
+        
+    
+        
+        <script 
src="../gitbook/gitbook-plugin-expandable-chapters/expandable-chapters.js"></script>
+        
+    
+        
+        <script 
src="../gitbook/gitbook-plugin-search/search-engine.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-search/search.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-lunr/lunr.min.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-lunr/search-lunr.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-sharing/buttons.js"></script>
+        
+    
+        
+        <script 
src="../gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script>
+        
+    
+        
+        <script 
src="../gitbook/gitbook-plugin-theme-api/theme-api.js"></script>
+        
+    
+
+    </body>
+</html>
+


Reply via email to