Repository: incubator-hivemall
Updated Branches:
  refs/heads/master bffd2c78d -> f2bf3a72b


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f2bf3a72/core/src/test/java/hivemall/topicmodel/PLSAPredictUDAFTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/hivemall/topicmodel/PLSAPredictUDAFTest.java 
b/core/src/test/java/hivemall/topicmodel/PLSAPredictUDAFTest.java
new file mode 100644
index 0000000..456dd1d
--- /dev/null
+++ b/core/src/test/java/hivemall/topicmodel/PLSAPredictUDAFTest.java
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.topicmodel;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+import org.apache.hadoop.hive.ql.udf.generic.SimpleGenericUDAFParameterInfo;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.HashMap;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+public class PLSAPredictUDAFTest {
+    PLSAPredictUDAF udaf;
+    GenericUDAFEvaluator evaluator;
+    ObjectInspector[] inputOIs;
+    ObjectInspector[] partialOI;
+    PLSAPredictUDAF.PLSAPredictAggregationBuffer agg;
+
+    String[] words;
+    int[] labels;
+    float[] probs;
+
+    @Test(expected = UDFArgumentException.class)
+    public void testWithoutOption() throws Exception {
+        udaf = new PLSAPredictUDAF();
+
+        inputOIs = new ObjectInspector[] {
+                
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.STRING),
+                
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.FLOAT),
+                
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.INT),
+                
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.FLOAT)};
+
+        evaluator = udaf.getEvaluator(new 
SimpleGenericUDAFParameterInfo(inputOIs, false, false));
+
+        evaluator.init(GenericUDAFEvaluator.Mode.PARTIAL1, inputOIs);
+    }
+
+    @Test(expected = UDFArgumentException.class)
+    public void testWithoutTopicOption() throws Exception {
+        udaf = new PLSAPredictUDAF();
+
+        inputOIs = new ObjectInspector[] {
+                
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.STRING),
+                
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.FLOAT),
+                
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.INT),
+                
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.FLOAT),
+                ObjectInspectorUtils.getConstantObjectInspector(
+                    PrimitiveObjectInspectorFactory.javaStringObjectInspector, 
"-alpha 0.1")};
+
+        evaluator = udaf.getEvaluator(new 
SimpleGenericUDAFParameterInfo(inputOIs, false, false));
+
+        evaluator.init(GenericUDAFEvaluator.Mode.PARTIAL1, inputOIs);
+    }
+
+    @Before
+    public void setUp() throws Exception {
+        udaf = new PLSAPredictUDAF();
+
+        inputOIs = new ObjectInspector[] {
+                
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.STRING),
+                
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.FLOAT),
+                
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.INT),
+                
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.FLOAT),
+                ObjectInspectorUtils.getConstantObjectInspector(
+                    PrimitiveObjectInspectorFactory.javaStringObjectInspector, 
"-topics 2")};
+
+        evaluator = udaf.getEvaluator(new 
SimpleGenericUDAFParameterInfo(inputOIs, false, false));
+
+        ArrayList<String> fieldNames = new ArrayList<String>();
+        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
+
+        fieldNames.add("wcList");
+        
fieldOIs.add(ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector));
+
+        fieldNames.add("probMap");
+        fieldOIs.add(ObjectInspectorFactory.getStandardMapObjectInspector(
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector,
+            
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaFloatObjectInspector)));
+
+        fieldNames.add("topics");
+        
fieldOIs.add(PrimitiveObjectInspectorFactory.writableIntObjectInspector);
+
+        fieldNames.add("alpha");
+        
fieldOIs.add(PrimitiveObjectInspectorFactory.writableFloatObjectInspector);
+
+        fieldNames.add("delta");
+        
fieldOIs.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+
+        partialOI = new ObjectInspector[4];
+        partialOI[0] = 
ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
+
+        agg = (PLSAPredictUDAF.PLSAPredictAggregationBuffer) 
evaluator.getNewAggregationBuffer();
+
+        words = new String[] {"fruits", "vegetables", "healthy", "flu", 
"apples", "oranges",
+                "like", "avocados", "colds", "colds", "avocados", "oranges", 
"like", "apples",
+                "flu", "healthy", "vegetables", "fruits"};
+        labels = new int[] {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 
1};
+        probs = new float[] {0.3339331f, 0.3324783f, 0.33209667f, 
3.2804057E-4f, 3.0303953E-4f,
+                2.4860457E-4f, 2.41481E-4f, 2.3554532E-4f, 1.352576E-4f, 
0.1660153f, 0.16596903f,
+                0.1659654f, 0.1659627f, 0.16593699f, 0.1659259f, 
0.0017611005f, 0.0015791848f,
+                8.84464E-4f};
+    }
+
+    @Test
+    public void test() throws Exception {
+        final Map<String, Float> doc1 = new HashMap<String, Float>();
+        doc1.put("fruits", 1.f);
+        doc1.put("healthy", 1.f);
+        doc1.put("vegetables", 1.f);
+
+        evaluator.init(GenericUDAFEvaluator.Mode.PARTIAL1, inputOIs);
+        evaluator.reset(agg);
+
+        for (int i = 0; i < words.length; i++) {
+            String word = words[i];
+            evaluator.iterate(agg, new Object[] {word, doc1.get(word), 
labels[i], probs[i]});
+        }
+        float[] doc1Distr = agg.get();
+
+        final Map<String, Float> doc2 = new HashMap<String, Float>();
+        doc2.put("apples", 1.f);
+        doc2.put("avocados", 1.f);
+        doc2.put("colds", 1.f);
+        doc2.put("flu", 1.f);
+        doc2.put("like", 2.f);
+        doc2.put("oranges", 1.f);
+
+        evaluator.init(GenericUDAFEvaluator.Mode.PARTIAL1, inputOIs);
+        evaluator.reset(agg);
+        for (int i = 0; i < words.length; i++) {
+            String word = words[i];
+            evaluator.iterate(agg, new Object[] {word, doc2.get(word), 
labels[i], probs[i]});
+        }
+        float[] doc2Distr = agg.get();
+
+        Assert.assertTrue(doc1Distr[0] > doc2Distr[0]);
+        Assert.assertTrue(doc1Distr[1] < doc2Distr[1]);
+    }
+
+    @Test
+    public void testMerge() throws Exception {
+        final Map<String, Float> doc = new HashMap<String, Float>();
+        doc.put("apples", 1.f);
+        doc.put("avocados", 1.f);
+        doc.put("colds", 1.f);
+        doc.put("flu", 1.f);
+        doc.put("like", 2.f);
+        doc.put("oranges", 1.f);
+
+        Object[] partials = new Object[3];
+
+        // bin #1
+        evaluator.init(GenericUDAFEvaluator.Mode.PARTIAL1, inputOIs);
+        evaluator.reset(agg);
+        for (int i = 0; i < 6; i++) {
+            evaluator.iterate(agg, new Object[] {words[i], doc.get(words[i]), 
labels[i], probs[i]});
+        }
+        partials[0] = evaluator.terminatePartial(agg);
+
+        // bin #2
+        evaluator.init(GenericUDAFEvaluator.Mode.PARTIAL1, inputOIs);
+        evaluator.reset(agg);
+        for (int i = 6; i < 12; i++) {
+            evaluator.iterate(agg, new Object[] {words[i], doc.get(words[i]), 
labels[i], probs[i]});
+        }
+        partials[1] = evaluator.terminatePartial(agg);
+
+        // bin #3
+        evaluator.init(GenericUDAFEvaluator.Mode.PARTIAL1, inputOIs);
+        evaluator.reset(agg);
+        for (int i = 12; i < 18; i++) {
+            evaluator.iterate(agg, new Object[] {words[i], doc.get(words[i]), 
labels[i], probs[i]});
+        }
+
+        partials[2] = evaluator.terminatePartial(agg);
+
+        // merge in a different order
+        final int[][] orders = new int[][] { {0, 1, 2}, {1, 0, 2}, {1, 2, 0}, 
{2, 1, 0}};
+        for (int i = 0; i < orders.length; i++) {
+            evaluator.init(GenericUDAFEvaluator.Mode.PARTIAL2, partialOI);
+            evaluator.reset(agg);
+
+            evaluator.merge(agg, partials[orders[i][0]]);
+            evaluator.merge(agg, partials[orders[i][1]]);
+            evaluator.merge(agg, partials[orders[i][2]]);
+
+            float[] distr = agg.get();
+            Assert.assertTrue(distr[0] < distr[1]);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f2bf3a72/core/src/test/java/hivemall/topicmodel/PLSAUDTFTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/hivemall/topicmodel/PLSAUDTFTest.java 
b/core/src/test/java/hivemall/topicmodel/PLSAUDTFTest.java
new file mode 100644
index 0000000..76795bc
--- /dev/null
+++ b/core/src/test/java/hivemall/topicmodel/PLSAUDTFTest.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.topicmodel;
+
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.Arrays;
+
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class PLSAUDTFTest {
+    private static final boolean DEBUG = false;
+
+    @Test
+    public void test() throws HiveException {
+        PLSAUDTF udtf = new PLSAUDTF();
+
+        ObjectInspector[] argOIs = new ObjectInspector[] {
+                
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector),
+                ObjectInspectorUtils.getConstantObjectInspector(
+                    PrimitiveObjectInspectorFactory.javaStringObjectInspector,
+                    "-topics 2 -alpha 0.1 -delta 0.00001")};
+
+        udtf.initialize(argOIs);
+
+        String[] doc1 = new String[] {"fruits:1", "healthy:1", "vegetables:1"};
+        String[] doc2 = new String[] {"apples:1", "avocados:1", "colds:1", 
"flu:1", "like:2",
+                "oranges:1"};
+        for (int it = 0; it < 10000; it++) {
+            udtf.process(new Object[] {Arrays.asList(doc1)});
+            udtf.process(new Object[] {Arrays.asList(doc2)});
+        }
+
+        SortedMap<Float, List<String>> topicWords;
+
+        println("Topic 0:");
+        println("========");
+        topicWords = udtf.getTopicWords(0);
+        for (Map.Entry<Float, List<String>> e : topicWords.entrySet()) {
+            List<String> words = e.getValue();
+            for (int i = 0; i < words.size(); i++) {
+                println(e.getKey() + " " + words.get(i));
+            }
+        }
+        println("========");
+
+        println("Topic 1:");
+        println("========");
+        topicWords = udtf.getTopicWords(1);
+        for (Map.Entry<Float, List<String>> e : topicWords.entrySet()) {
+            List<String> words = e.getValue();
+            for (int i = 0; i < words.size(); i++) {
+                println(e.getKey() + " " + words.get(i));
+            }
+        }
+        println("========");
+
+        int k1, k2;
+        float[] topicDistr = udtf.getTopicDistribution(doc1);
+        if (topicDistr[0] > topicDistr[1]) {
+            // topic 0 MUST represent doc#1
+            k1 = 0;
+            k2 = 1;
+        } else {
+            k1 = 1;
+            k2 = 0;
+        }
+
+        Assert.assertTrue("doc1 is in topic " + k1 + " (" + (topicDistr[k1] * 
100) + "%), "
+                + "and `vegetables` SHOULD be more suitable topic word than 
`flu` in the topic",
+            udtf.getProbability("vegetables", k1) > udtf.getProbability("flu", 
k1));
+        Assert.assertTrue("doc2 is in topic " + k2 + " (" + (topicDistr[k2] * 
100) + "%), "
+                + "and `avocados` SHOULD be more suitable topic word than 
`healthy` in the topic",
+            udtf.getProbability("avocados", k2) > 
udtf.getProbability("healthy", k2));
+    }
+
+    private static void println(String msg) {
+        if (DEBUG) {
+            System.out.println(msg);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f2bf3a72/docs/gitbook/SUMMARY.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md
index 695119a..3d035d7 100644
--- a/docs/gitbook/SUMMARY.md
+++ b/docs/gitbook/SUMMARY.md
@@ -153,6 +153,7 @@
 ## Part X - Clustering
 
 * [Latent Dirichlet Allocation](clustering/lda.md)
+* [Probabilistic Latent Semantic Analysis](clustering/plsa.md)
 
 ## Part XI - GeoSpatial functions
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f2bf3a72/docs/gitbook/clustering/plsa.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/clustering/plsa.md b/docs/gitbook/clustering/plsa.md
new file mode 100644
index 0000000..456dfe7
--- /dev/null
+++ b/docs/gitbook/clustering/plsa.md
@@ -0,0 +1,154 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+As described in [our user guide for Latent Dirichlet Allocation 
(LDA)](lda.md), Hivemall enables you to apply clustering for your data based on 
a topic modeling technique. While LDA is one of the most popular techniques, 
there is another approach named **Probabilistic Latent Semantic Analysis** 
(pLSA). In fact, pLSA is the predecessor of LDA, but it has an advantage in 
terms of running time.
+
+- T. Hofmann. [Probabilistic Latent Semantic 
Indexing](http://dl.acm.org/citation.cfm?id=312649). SIGIR 1999, pp. 50-57.
+- T. Hofmann. [Probabilistic Latent Semantic 
Analysis](http://www.iro.umontreal.ca/~nie/IFT6255/Hofmann-UAI99.pdf). UAI 
1999, pp. 289-296.
+
+In order to efficiently handle large-scale data, our pLSA implementation is 
based on the following incremental variant of the original pLSA algorithm:
+
+- H. Wu, et al. [Incremental Probabilistic Latent Semantic Analysis for 
Automatic Question Recommendation](http://dl.acm.org/citation.cfm?id=1454026). 
RecSys 2008, pp. 99-106.
+
+<!-- toc -->
+
+> #### Note
+> This feature is supported from Hivemall v0.5-rc.1 or later.
+
+# Usage
+
+Basically, you can use our pLSA function in a similar way to LDA.
+
+In particular, we have two pLSA functions, `train_plsa()` and 
`plsa_predict()`. These functions can be used almost interchangeably with 
`train_lda()` and `lda_predict()`. Thus, reading [our user guide for 
LDA](lda.md) should be helpful before trying pLSA.
+
+In short, for the sample `docs` table we introduced in the LDA tutorial:
+
+| docid | doc  |
+|:---:|:---|
+| 1  | "Fruits and vegetables are healthy." |
+|2 | "I like apples, oranges, and avocados. I do not like the flu or colds." |
+| ... | ... |
+
+a pLSA model can be built as follows:
+
+```sql
+with word_counts as (
+  select
+    docid,
+    feature(word, count(word)) as f
+  from docs t1 lateral view explode(tokenize(doc, true)) t2 as word
+  where
+    not is_stopword(word)
+  group by
+    docid, word
+)
+select
+       train_plsa(feature, "-topics 2 -eps 0.00001 -iter 2048 -alpha 0.01") as 
(label, word, prob)
+from (
+  select docid, collect_set(f) as feature
+  from word_counts
+  group by docid
+) t
+;
+```
+
+|label |  word  |  prob|
+|:---:|:---:|:---:|
+|0|       like   | 0.28549945|
+|0|       colds  | 0.14294468|
+|0|       apples | 0.14291435|
+|0|       avocados|        0.1428958|
+|0|       flu    | 0.14287639|
+|0|       oranges| 0.1428691|
+|0|       healthy| 1.2605103E-7|
+|0|       fruits | 4.772253E-8|
+|0|       vegetables |     1.929087E-8|
+|1|       vegetables  |    0.32713377|
+|1|       fruits | 0.32713372|
+|1|       healthy| 0.3271335|
+|1|       like   | 0.006977764|
+|1|       oranges| 0.0025642214|
+|1|       flu    | 0.002507711|
+|1|       avocados|        0.0023572792|
+|1|       apples | 0.002213457|
+|1|       colds  | 0.001978546|
+
+
+
+And prediction can be done as:
+
+```sql
+with test as (
+  select
+    docid,
+    word,
+    count(word) as value
+  from docs t1 LATERAL VIEW explode(tokenize(doc, true)) t2 as word
+  where
+    not is_stopword(word)
+  group by
+    docid, word
+),
+topic as (
+  select
+    t.docid,
+    plsa_predict(t.word, t.value, m.label, m.prob, "-topics 2") as 
probabilities
+  from
+    test t
+    JOIN plsa_model m ON (t.word = m.word)
+  group by
+    t.docid
+)
+select docid, probabilities, probabilities[0].label, m.words -- topic each 
document should be assigned
+from topic t
+join (
+  select label, collect_set(feature(word, prob)) as words
+  from plsa_model
+  group by label
+) m on t.probabilities[0].label = m.label
+;
+```
+
+
+|docid  | probabilities |  label |  m.words |
+|:---:|:---|:---:|:---|
+|1      | 
[{"label":1,"probability":0.72298235},{"label":0,"probability":0.27701768}]   | 
 1 |      
["vegetables:0.32713377","fruits:0.32713372","healthy:0.3271335","like:0.006977764","oranges:0.0025642214","flu:0.002507711","avocados:0.0023572792","apples:0.002213457","colds:0.001978546"]|
+|2  |     
[{"label":0,"probability":0.7052526},{"label":1,"probability":0.2947474}]     | 
 0     |  
["like:0.28549945","colds:0.14294468","apples:0.14291435","avocados:0.1428958","flu:0.14287639","oranges:0.1428691","healthy:1.2605103E-7","fruits:4.772253E-8","vegetables:1.929087E-8"]|
+
+# Difference with LDA
+
+The main advantage of using pLSA is its efficiency. Since mathematical 
formulation and optimization logic is much simpler than LDA, using pLSA 
generally requires much shorter running time.
+
+In terms of accuracy, LDA could be better than pLSA. For example, the word `like`, which appears twice in the sample document #2 above, gets relatively large probabilities in both topic #0 and topic #1, even though document #1 does not contain the word at all. By contrast, LDA results (i.e., *lambda* values) are more clearly separated as shown in [the LDA page](lda.md). Thus, a pLSA model is more likely to be biased.
+
+For the reasons mentioned above, we recommend trying LDA first. If you then encounter problems such as slow running time or undesirable clustering results, consider trying the alternative pLSA approach.
+
+# Setting hyper-parameter `alpha`
+
+For training pLSA, we set a hyper-parameter `alpha` in the above example:
+
+```sql
+SELECT train_plsa(feature, "-topics 2 -eps 0.00001 -iter 2048 -alpha 0.01") 
+```
+
+This value controls **how much iterative model update is affected by the old 
results**.
+
+From an algorithmic point of view, training pLSA (and LDA) iteratively repeats 
certain operations and updates the target value (i.e., probability obtained as 
a result of `train_plsa()`). This iterative procedure gradually makes the 
probabilities more accurate. What `alpha` does is to control the degree of the 
change of probabilities in each step.
+
+Normally, `alpha` is set to a small value from 0.0 to 0.5 (default is 0.5).
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f2bf3a72/resources/ddl/define-all-as-permanent.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all-as-permanent.hive 
b/resources/ddl/define-all-as-permanent.hive
index 435466d..425d8ff 100644
--- a/resources/ddl/define-all-as-permanent.hive
+++ b/resources/ddl/define-all-as-permanent.hive
@@ -626,6 +626,12 @@ CREATE FUNCTION train_lda as 'hivemall.topicmodel.LDAUDTF' 
USING JAR '${hivemall
 DROP FUNCTION IF EXISTS lda_predict;
 CREATE FUNCTION lda_predict as 'hivemall.topicmodel.LDAPredictUDAF' USING JAR 
'${hivemall_jar}';
 
+DROP FUNCTION IF EXISTS train_plsa;
+CREATE FUNCTION train_plsa as 'hivemall.topicmodel.PLSAUDTF' USING JAR 
'${hivemall_jar}';
+
+DROP FUNCTION IF EXISTS plsa_predict;
+CREATE FUNCTION plsa_predict as 'hivemall.topicmodel.PLSAPredictUDAF' USING 
JAR '${hivemall_jar}';
+
 ---------------------------
 -- Geo-Spatial functions --
 ---------------------------

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f2bf3a72/resources/ddl/define-all.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive
index 8982ef4..d283812 100644
--- a/resources/ddl/define-all.hive
+++ b/resources/ddl/define-all.hive
@@ -622,6 +622,12 @@ create temporary function train_lda as 
'hivemall.topicmodel.LDAUDTF';
 drop temporary function if exists lda_predict;
 create temporary function lda_predict as 'hivemall.topicmodel.LDAPredictUDAF';
 
+drop temporary function if exists train_plsa;
+create temporary function train_plsa as 'hivemall.topicmodel.PLSAUDTF';
+
+drop temporary function if exists plsa_predict;
+create temporary function plsa_predict as 
'hivemall.topicmodel.PLSAPredictUDAF';
+
 ---------------------------
 -- Geo-Spatial functions --
 ---------------------------

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f2bf3a72/resources/ddl/define-all.spark
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark
index a6473db..1b90c9b 100644
--- a/resources/ddl/define-all.spark
+++ b/resources/ddl/define-all.spark
@@ -606,6 +606,12 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION train_lda AS 
'hivemall.topicmodel.LDAU
 sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS lda_predict")
 sqlContext.sql("CREATE TEMPORARY FUNCTION lda_predict AS 
'hivemall.topicmodel.LDAPredictUDAF'")
 
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS train_plsa")
+sqlContext.sql("CREATE TEMPORARY FUNCTION train_plsa AS 
'hivemall.topicmodel.PLSAUDTF'")
+
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS plsa_predict")
+sqlContext.sql("CREATE TEMPORARY FUNCTION plsa_predict AS 
'hivemall.topicmodel.PLSAPredictUDAF'")
+
 /**
  * Geo Spatial Functions
  */

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f2bf3a72/resources/ddl/define-udfs.td.hql
----------------------------------------------------------------------
diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql
index a2e5838..e549649 100644
--- a/resources/ddl/define-udfs.td.hql
+++ b/resources/ddl/define-udfs.td.hql
@@ -160,6 +160,8 @@ create temporary function changefinder as 
'hivemall.anomaly.ChangeFinderUDF';
 create temporary function sst as 
'hivemall.anomaly.SingularSpectrumTransformUDF';
 create temporary function train_lda as 'hivemall.topicmodel.LDAUDTF';
 create temporary function lda_predict as 'hivemall.topicmodel.LDAPredictUDAF';
+create temporary function train_plsa as 'hivemall.topicmodel.PLSAUDTF';
+create temporary function plsa_predict as 
'hivemall.topicmodel.PLSAPredictUDAF';
 create temporary function tile as 'hivemall.geospatial.TileUDF';
 create temporary function map_url as 'hivemall.geospatial.MapURLUDF';
 

Reply via email to