Repository: incubator-hivemall Updated Branches: refs/heads/master bffd2c78d -> f2bf3a72b
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f2bf3a72/core/src/test/java/hivemall/topicmodel/PLSAPredictUDAFTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/topicmodel/PLSAPredictUDAFTest.java b/core/src/test/java/hivemall/topicmodel/PLSAPredictUDAFTest.java new file mode 100644 index 0000000..456dd1d --- /dev/null +++ b/core/src/test/java/hivemall/topicmodel/PLSAPredictUDAFTest.java @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.topicmodel; + +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; +import org.apache.hadoop.hive.ql.udf.generic.SimpleGenericUDAFParameterInfo; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; + +import java.util.ArrayList; +import java.util.Map; +import java.util.HashMap; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class PLSAPredictUDAFTest { + PLSAPredictUDAF udaf; + GenericUDAFEvaluator evaluator; + ObjectInspector[] inputOIs; + ObjectInspector[] partialOI; + PLSAPredictUDAF.PLSAPredictAggregationBuffer agg; + + String[] words; + int[] labels; + float[] probs; + + @Test(expected = UDFArgumentException.class) + public void testWithoutOption() throws Exception { + udaf = new PLSAPredictUDAF(); + + inputOIs = new ObjectInspector[] { + PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.STRING), + PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.FLOAT), + PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.INT), + PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.FLOAT)}; + + evaluator = udaf.getEvaluator(new SimpleGenericUDAFParameterInfo(inputOIs, false, false)); + + evaluator.init(GenericUDAFEvaluator.Mode.PARTIAL1, inputOIs); + } + + @Test(expected = UDFArgumentException.class) + public void testWithoutTopicOption() throws Exception { + udaf = new 
PLSAPredictUDAF(); + + inputOIs = new ObjectInspector[] { + PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.STRING), + PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.FLOAT), + PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.INT), + PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.FLOAT), + ObjectInspectorUtils.getConstantObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-alpha 0.1")}; + + evaluator = udaf.getEvaluator(new SimpleGenericUDAFParameterInfo(inputOIs, false, false)); + + evaluator.init(GenericUDAFEvaluator.Mode.PARTIAL1, inputOIs); + } + + @Before + public void setUp() throws Exception { + udaf = new PLSAPredictUDAF(); + + inputOIs = new ObjectInspector[] { + PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.STRING), + PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.FLOAT), + PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.INT), + PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.FLOAT), + ObjectInspectorUtils.getConstantObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-topics 2")}; + + evaluator = udaf.getEvaluator(new SimpleGenericUDAFParameterInfo(inputOIs, false, false)); + + ArrayList<String> fieldNames = new ArrayList<String>(); + ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>(); + + fieldNames.add("wcList"); + fieldOIs.add(ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector)); + + fieldNames.add("probMap"); + 
fieldOIs.add(ObjectInspectorFactory.getStandardMapObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, + ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaFloatObjectInspector))); + + fieldNames.add("topics"); + fieldOIs.add(PrimitiveObjectInspectorFactory.writableIntObjectInspector); + + fieldNames.add("alpha"); + fieldOIs.add(PrimitiveObjectInspectorFactory.writableFloatObjectInspector); + + fieldNames.add("delta"); + fieldOIs.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); + + partialOI = new ObjectInspector[4]; + partialOI[0] = ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs); + + agg = (PLSAPredictUDAF.PLSAPredictAggregationBuffer) evaluator.getNewAggregationBuffer(); + + words = new String[] {"fruits", "vegetables", "healthy", "flu", "apples", "oranges", + "like", "avocados", "colds", "colds", "avocados", "oranges", "like", "apples", + "flu", "healthy", "vegetables", "fruits"}; + labels = new int[] {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + probs = new float[] {0.3339331f, 0.3324783f, 0.33209667f, 3.2804057E-4f, 3.0303953E-4f, + 2.4860457E-4f, 2.41481E-4f, 2.3554532E-4f, 1.352576E-4f, 0.1660153f, 0.16596903f, + 0.1659654f, 0.1659627f, 0.16593699f, 0.1659259f, 0.0017611005f, 0.0015791848f, + 8.84464E-4f}; + } + + @Test + public void test() throws Exception { + final Map<String, Float> doc1 = new HashMap<String, Float>(); + doc1.put("fruits", 1.f); + doc1.put("healthy", 1.f); + doc1.put("vegetables", 1.f); + + evaluator.init(GenericUDAFEvaluator.Mode.PARTIAL1, inputOIs); + evaluator.reset(agg); + + for (int i = 0; i < words.length; i++) { + String word = words[i]; + evaluator.iterate(agg, new Object[] {word, doc1.get(word), labels[i], probs[i]}); + } + float[] doc1Distr = agg.get(); + + final Map<String, Float> doc2 = new HashMap<String, Float>(); + doc2.put("apples", 1.f); + doc2.put("avocados", 1.f); + doc2.put("colds", 1.f); + 
doc2.put("flu", 1.f); + doc2.put("like", 2.f); + doc2.put("oranges", 1.f); + + evaluator.init(GenericUDAFEvaluator.Mode.PARTIAL1, inputOIs); + evaluator.reset(agg); + for (int i = 0; i < words.length; i++) { + String word = words[i]; + evaluator.iterate(agg, new Object[] {word, doc2.get(word), labels[i], probs[i]}); + } + float[] doc2Distr = agg.get(); + + Assert.assertTrue(doc1Distr[0] > doc2Distr[0]); + Assert.assertTrue(doc1Distr[1] < doc2Distr[1]); + } + + @Test + public void testMerge() throws Exception { + final Map<String, Float> doc = new HashMap<String, Float>(); + doc.put("apples", 1.f); + doc.put("avocados", 1.f); + doc.put("colds", 1.f); + doc.put("flu", 1.f); + doc.put("like", 2.f); + doc.put("oranges", 1.f); + + Object[] partials = new Object[3]; + + // bin #1 + evaluator.init(GenericUDAFEvaluator.Mode.PARTIAL1, inputOIs); + evaluator.reset(agg); + for (int i = 0; i < 6; i++) { + evaluator.iterate(agg, new Object[] {words[i], doc.get(words[i]), labels[i], probs[i]}); + } + partials[0] = evaluator.terminatePartial(agg); + + // bin #2 + evaluator.init(GenericUDAFEvaluator.Mode.PARTIAL1, inputOIs); + evaluator.reset(agg); + for (int i = 6; i < 12; i++) { + evaluator.iterate(agg, new Object[] {words[i], doc.get(words[i]), labels[i], probs[i]}); + } + partials[1] = evaluator.terminatePartial(agg); + + // bin #3 + evaluator.init(GenericUDAFEvaluator.Mode.PARTIAL1, inputOIs); + evaluator.reset(agg); + for (int i = 12; i < 18; i++) { + evaluator.iterate(agg, new Object[] {words[i], doc.get(words[i]), labels[i], probs[i]}); + } + + partials[2] = evaluator.terminatePartial(agg); + + // merge in a different order + final int[][] orders = new int[][] { {0, 1, 2}, {1, 0, 2}, {1, 2, 0}, {2, 1, 0}}; + for (int i = 0; i < orders.length; i++) { + evaluator.init(GenericUDAFEvaluator.Mode.PARTIAL2, partialOI); + evaluator.reset(agg); + + evaluator.merge(agg, partials[orders[i][0]]); + evaluator.merge(agg, partials[orders[i][1]]); + evaluator.merge(agg, 
partials[orders[i][2]]); + + float[] distr = agg.get(); + Assert.assertTrue(distr[0] < distr[1]); + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f2bf3a72/core/src/test/java/hivemall/topicmodel/PLSAUDTFTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/topicmodel/PLSAUDTFTest.java b/core/src/test/java/hivemall/topicmodel/PLSAUDTFTest.java new file mode 100644 index 0000000..76795bc --- /dev/null +++ b/core/src/test/java/hivemall/topicmodel/PLSAUDTFTest.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.topicmodel; + +import java.util.List; +import java.util.Map; +import java.util.SortedMap; +import java.util.Arrays; + +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; + +import org.junit.Assert; +import org.junit.Test; + +public class PLSAUDTFTest { + private static final boolean DEBUG = false; + + @Test + public void test() throws HiveException { + PLSAUDTF udtf = new PLSAUDTF(); + + ObjectInspector[] argOIs = new ObjectInspector[] { + ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector), + ObjectInspectorUtils.getConstantObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, + "-topics 2 -alpha 0.1 -delta 0.00001")}; + + udtf.initialize(argOIs); + + String[] doc1 = new String[] {"fruits:1", "healthy:1", "vegetables:1"}; + String[] doc2 = new String[] {"apples:1", "avocados:1", "colds:1", "flu:1", "like:2", + "oranges:1"}; + for (int it = 0; it < 10000; it++) { + udtf.process(new Object[] {Arrays.asList(doc1)}); + udtf.process(new Object[] {Arrays.asList(doc2)}); + } + + SortedMap<Float, List<String>> topicWords; + + println("Topic 0:"); + println("========"); + topicWords = udtf.getTopicWords(0); + for (Map.Entry<Float, List<String>> e : topicWords.entrySet()) { + List<String> words = e.getValue(); + for (int i = 0; i < words.size(); i++) { + println(e.getKey() + " " + words.get(i)); + } + } + println("========"); + + println("Topic 1:"); + println("========"); + topicWords = udtf.getTopicWords(1); + for (Map.Entry<Float, List<String>> e : topicWords.entrySet()) { + List<String> words = e.getValue(); + for (int i = 0; i < 
words.size(); i++) { + println(e.getKey() + " " + words.get(i)); + } + } + println("========"); + + int k1, k2; + float[] topicDistr = udtf.getTopicDistribution(doc1); + if (topicDistr[0] > topicDistr[1]) { + // topic 0 MUST represent doc#1 + k1 = 0; + k2 = 1; + } else { + k1 = 1; + k2 = 0; + } + + Assert.assertTrue("doc1 is in topic " + k1 + " (" + (topicDistr[k1] * 100) + "%), " + + "and `vegetables` SHOULD be more suitable topic word than `flu` in the topic", + udtf.getProbability("vegetables", k1) > udtf.getProbability("flu", k1)); + Assert.assertTrue("doc2 is in topic " + k2 + " (" + (topicDistr[k2] * 100) + "%), " + + "and `avocados` SHOULD be more suitable topic word than `healthy` in the topic", + udtf.getProbability("avocados", k2) > udtf.getProbability("healthy", k2)); + } + + private static void println(String msg) { + if (DEBUG) { + System.out.println(msg); + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f2bf3a72/docs/gitbook/SUMMARY.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md index 695119a..3d035d7 100644 --- a/docs/gitbook/SUMMARY.md +++ b/docs/gitbook/SUMMARY.md @@ -153,6 +153,7 @@ ## Part X - Clustering * [Latent Dirichlet Allocation](clustering/lda.md) +* [Probabilistic Latent Semantic Analysis](clustering/plsa.md) ## Part XI - GeoSpatial functions http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f2bf3a72/docs/gitbook/clustering/plsa.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/clustering/plsa.md b/docs/gitbook/clustering/plsa.md new file mode 100644 index 0000000..456dfe7 --- /dev/null +++ b/docs/gitbook/clustering/plsa.md @@ -0,0 +1,154 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. 
The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +As described in [our user guide for Latent Dirichlet Allocation (LDA)](lda.md), Hivemall enables you to apply clustering to your data based on a topic modeling technique. While LDA is one of the most popular techniques, there is another approach named **Probabilistic Latent Semantic Analysis** (pLSA). In fact, pLSA is the predecessor of LDA, but it has an advantage in terms of running time. + +- T. Hofmann. [Probabilistic Latent Semantic Indexing](http://dl.acm.org/citation.cfm?id=312649). SIGIR 1999, pp. 50-57. +- T. Hofmann. [Probabilistic Latent Semantic Analysis](http://www.iro.umontreal.ca/~nie/IFT6255/Hofmann-UAI99.pdf). UAI 1999, pp. 289-296. + +In order to efficiently handle large-scale data, our pLSA implementation is based on the following incremental variant of the original pLSA algorithm: + +- H. Wu, et al. [Incremental Probabilistic Latent Semantic Analysis for Automatic Question Recommendation](http://dl.acm.org/citation.cfm?id=1454026). RecSys 2008, pp. 99-106. + +<!-- toc --> + +> #### Note +> This feature is supported in Hivemall v0.5-rc.1 or later. + +# Usage + +Basically, you can use our pLSA functions in a similar way to LDA. + +In particular, we have two pLSA functions, `train_plsa()` and `plsa_predict()`. These functions can be used almost interchangeably with `train_lda()` and `lda_predict()`. Thus, reading [our user guide for LDA](lda.md) should be helpful before trying pLSA. 
+ +In short, for the sample `docs` table we introduced in the LDA tutorial: + +| docid | doc | +|:---:|:---| +| 1 | "Fruits and vegetables are healthy." | +|2 | "I like apples, oranges, and avocados. I do not like the flu or colds." | +| ... | ... | + +a pLSA model can be built as follows: + +```sql +with word_counts as ( + select + docid, + feature(word, count(word)) as f + from docs t1 lateral view explode(tokenize(doc, true)) t2 as word + where + not is_stopword(word) + group by + docid, word +) +select + train_plsa(feature, "-topics 2 -eps 0.00001 -iter 2048 -alpha 0.01") as (label, word, prob) +from ( + select docid, collect_set(f) as feature + from word_counts + group by docid +) t +; +``` + +|label | word | prob| +|:---:|:---:|:---:| +|0| like | 0.28549945| +|0| colds | 0.14294468| +|0| apples | 0.14291435| +|0| avocados| 0.1428958| +|0| flu | 0.14287639| +|0| oranges| 0.1428691| +|0| healthy| 1.2605103E-7| +|0| fruits | 4.772253E-8| +|0| vegetables | 1.929087E-8| +|1| vegetables | 0.32713377| +|1| fruits | 0.32713372| +|1| healthy| 0.3271335| +|1| like | 0.006977764| +|1| oranges| 0.0025642214| +|1| flu | 0.002507711| +|1| avocados| 0.0023572792| +|1| apples | 0.002213457| +|1| colds | 0.001978546| + + + +And prediction can be done as: + +```sql +test as ( + select + docid, + word, + count(word) as value + from docs t1 LATERAL VIEW explode(tokenize(doc, true)) t2 as word + where + not is_stopword(word) + group by + docid, word +), +topic as ( + select + t.docid, + plsa_predict(t.word, t.value, m.label, m.prob, "-topics 2") as probabilities + from + test t + JOIN plsa_model m ON (t.word = m.word) + group by + t.docid +) +select docid, probabilities, probabilities[0].label, m.words -- topic each document should be assigned +from topic t +join ( + select label, collect_set(feature(word, prob)) as words + from plsa_model + group by label +) m on t.probabilities[0].label = m.label +; +``` + + +|docid | probabilities | label | m.words | +|:---:|:---|:---:|:---| 
+|1 | [{"label":1,"probability":0.72298235},{"label":0,"probability":0.27701768}] | 1 | ["vegetables:0.32713377","fruits:0.32713372","healthy:0.3271335","like:0.006977764","oranges:0.0025642214","flu:0.002507711","avocados:0.0023572792","apples:0.002213457","colds:0.001978546"]| +|2 | [{"label":0,"probability":0.7052526},{"label":1,"probability":0.2947474}] | 0 | ["like:0.28549945","colds:0.14294468","apples:0.14291435","avocados:0.1428958","flu:0.14287639","oranges:0.1428691","healthy:1.2605103E-7","fruits:4.772253E-8","vegetables:1.929087E-8"]| + +# Difference with LDA + +The main advantage of using pLSA is its efficiency. Since its mathematical formulation and optimization logic are much simpler than those of LDA, pLSA generally requires a much shorter running time. + +In terms of accuracy, LDA could be better than pLSA. For example, the word `like`, which appears twice in the above sample document #2, gets a relatively large probability in both topics, even though one of the documents does not contain the word. By contrast, LDA results (i.e., *lambda* values) are more clearly separated as shown in [the LDA page](lda.md). Thus, a pLSA model is likely to be biased. + +For the reasons mentioned above, we recommend that you first use LDA. After that, if you encounter problems such as slow running time or undesirable clustering results, try pLSA as an alternative approach. + +# Setting hyper-parameter `alpha` + +For training pLSA, we set a hyper-parameter `alpha` in the above example: + +```sql +SELECT train_plsa(feature, "-topics 2 -eps 0.00001 -iter 2048 -alpha 0.01") +``` + +This value controls **how much each iterative model update is affected by the old results**. + +From an algorithmic point of view, training pLSA (and LDA) iteratively repeats certain operations and updates the target value (i.e., probability obtained as a result of `train_plsa()`). This iterative procedure gradually makes the probabilities more accurate. 
What `alpha` does is control how much the probabilities change in each step. + +Normally, `alpha` is set to a small value from 0.0 to 0.5 (default is 0.5). \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f2bf3a72/resources/ddl/define-all-as-permanent.hive ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index 435466d..425d8ff 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -626,6 +626,12 @@ CREATE FUNCTION train_lda as 'hivemall.topicmodel.LDAUDTF' USING JAR '${hivemall DROP FUNCTION IF EXISTS lda_predict; CREATE FUNCTION lda_predict as 'hivemall.topicmodel.LDAPredictUDAF' USING JAR '${hivemall_jar}'; +DROP FUNCTION IF EXISTS train_plsa; +CREATE FUNCTION train_plsa as 'hivemall.topicmodel.PLSAUDTF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS plsa_predict; +CREATE FUNCTION plsa_predict as 'hivemall.topicmodel.PLSAPredictUDAF' USING JAR '${hivemall_jar}'; + --------------------------- -- Geo-Spatial functions -- --------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f2bf3a72/resources/ddl/define-all.hive ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 8982ef4..d283812 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -622,6 +622,12 @@ create temporary function train_lda as 'hivemall.topicmodel.LDAUDTF'; drop temporary function if exists lda_predict; create temporary function lda_predict as 'hivemall.topicmodel.LDAPredictUDAF'; +drop temporary function if exists train_plsa; +create temporary function train_plsa as 'hivemall.topicmodel.PLSAUDTF'; + +drop temporary function if exists plsa_predict; +create temporary function plsa_predict as 
'hivemall.topicmodel.PLSAPredictUDAF'; + --------------------------- -- Geo-Spatial functions -- --------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f2bf3a72/resources/ddl/define-all.spark ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index a6473db..1b90c9b 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -606,6 +606,12 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION train_lda AS 'hivemall.topicmodel.LDAU sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS lda_predict") sqlContext.sql("CREATE TEMPORARY FUNCTION lda_predict AS 'hivemall.topicmodel.LDAPredictUDAF'") +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS train_plsa") +sqlContext.sql("CREATE TEMPORARY FUNCTION train_plsa AS 'hivemall.topicmodel.PLSAUDTF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS plsa_predict") +sqlContext.sql("CREATE TEMPORARY FUNCTION plsa_predict AS 'hivemall.topicmodel.PLSAPredictUDAF'") + /** * Geo Spatial Functions */ http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f2bf3a72/resources/ddl/define-udfs.td.hql ---------------------------------------------------------------------- diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index a2e5838..e549649 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -160,6 +160,8 @@ create temporary function changefinder as 'hivemall.anomaly.ChangeFinderUDF'; create temporary function sst as 'hivemall.anomaly.SingularSpectrumTransformUDF'; create temporary function train_lda as 'hivemall.topicmodel.LDAUDTF'; create temporary function lda_predict as 'hivemall.topicmodel.LDAPredictUDAF'; +create temporary function train_plsa as 'hivemall.topicmodel.PLSAUDTF'; +create temporary function plsa_predict as 'hivemall.topicmodel.PLSAPredictUDAF'; create temporary function tile as 
'hivemall.geospatial.TileUDF'; create temporary function map_url as 'hivemall.geospatial.MapURLUDF';
