[1/2] incubator-hivemall git commit: Close #72: [HIVEMALL-86] Updated Hadoop version dependencies from cdh3 to v2.4.0

myui Tue, 18 Apr 2017 02:48:13 -0700

Repository: incubator-hivemall
Updated Branches:
  refs/heads/master 8aae974fc -> cb16a3944



http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/core/src/test/java/hivemall/topicmodel/OnlineLDAModelTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/hivemall/topicmodel/OnlineLDAModelTest.java 
b/core/src/test/java/hivemall/topicmodel/OnlineLDAModelTest.java
new file mode 100644
index 0000000..e151943
--- /dev/null
+++ b/core/src/test/java/hivemall/topicmodel/OnlineLDAModelTest.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.topicmodel;
+
+import java.util.Map;
+import java.util.List;
+import java.util.SortedMap;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class OnlineLDAModelTest {
+    private static final boolean DEBUG = false; // when true, println() below echoes progress to stdout
+
+    @Test
+    public void test() {
+        int K = 2; // presumably the number of topics; topics 0 and 1 are inspected below
+        int it = 0; // iteration counter, used only in the debug output
+        float perplexityPrev;
+        float perplexity = Float.MAX_VALUE;
+
+        OnlineLDAModel model = new OnlineLDAModel(K, 1.f / K, 1.f / K, 2, 80, 
0.8, 1E-5d);
+
+        String[] doc1 = new String[] {"fruits:1", "healthy:1", "vegetables:1"};
+        String[] doc2 = new String[] {"apples:1", "avocados:1", "colds:1", 
"flu:1", "like:2", "oranges:1"};
+
+        do {
+            perplexityPrev = perplexity;
+            perplexity = 0.f;
+
+            // online (i.e., one-by-one) updating: train on a single document, then sample the perplexity
+            model.train(new String[][] {doc1});
+            perplexity += model.computePerplexity();
+
+            model.train(new String[][] {doc2});
+            perplexity += model.computePerplexity();
+
+            perplexity /= 2.f; // mean perplexity for the 2 docs
+
+            it++;
+            println("Iteration " + it + ": mean perplexity = " + perplexity);
+        } while(Math.abs(perplexityPrev - perplexity) >= 1E-6f); // repeat until the mean perplexity changes by less than 1E-6
+
+        SortedMap<Float, List<String>> topicWords;
+
+        println("Topic 0:");
+        println("========");
+        topicWords = model.getTopicWords(0);
+        for (Map.Entry<Float, List<String>> e : topicWords.entrySet()) {
+            List<String> words = e.getValue();
+            for (int i = 0; i < words.size(); i++) {
+                println(e.getKey() + " " + words.get(i));
+            }
+        }
+        println("========");
+
+        println("Topic 1:");
+        println("========");
+        topicWords = model.getTopicWords(1);
+        for (Map.Entry<Float, List<String>> e : topicWords.entrySet()) {
+            List<String> words = e.getValue();
+            for (int i = 0; i < words.size(); i++) {
+                println(e.getKey() + " " + words.get(i));
+            }
+        }
+        println("========");
+
+        int k1, k2; // k1: topic index chosen for doc1; k2: the remaining topic, checked against doc2's words
+        float[] topicDistr = model.getTopicDistribution(doc1);
+        if (topicDistr[0] > topicDistr[1]) {
+            // topic 0 has the larger value in doc1's topic distribution, so treat it as doc1's topic
+            k1 = 0;
+            k2 = 1;
+        } else {
+            k1 = 1;
+            k2 = 0;
+        }
+        Assert.assertTrue("doc1 is in topic " + k1 + " (" + (topicDistr[k1] * 
100) + "%), "
+            + "and `vegetables` SHOULD be more suitable topic word than `flu` 
in the topic",
+            model.getLambda("vegetables", k1) > model.getLambda("flu", k1));
+        Assert.assertTrue("doc2 is in topic " + k2 + " (" + (topicDistr[k2] * 
100) + "%), "
+            + "and `avocados` SHOULD be more suitable topic word than 
`healthy` in the topic",
+            model.getLambda("avocados", k2) > model.getLambda("healthy", k2));
+    }
+
+    private static void println(String msg) { // prints msg only when DEBUG is enabled
+        if (DEBUG) {
+            System.out.println(msg);
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/docs/gitbook/SUMMARY.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md
index 4c6ed1b..78b1faa 100644
--- a/docs/gitbook/SUMMARY.md
+++ b/docs/gitbook/SUMMARY.md
@@ -150,7 +150,11 @@
 * [Change-Point Detection using Singular Spectrum Transformation 
(SST)](anomaly/sst.md)
 * [ChangeFinder: Detecting Outlier and Change-Point 
Simultaneously](anomaly/changefinder.md)
 
-## Part X - Hivemall on Spark
+## Part X - Clustering
+
+* [Latent Dirichlet Allocation](clustering/lda.md)
+
+## Part XI - Hivemall on Spark
 
 * [Getting Started](spark/getting_started/README.md)
     * [Installation](spark/getting_started/installation.md)
@@ -165,7 +169,7 @@
     * [Top-k Join processing](spark/misc/topk_join.md)
     * [Other utility functions](spark/misc/functions.md)
 
-## Part X - External References
+## Part XII - External References
 
 * [Hivemall on Apache Spark](https://github.com/maropu/hivemall-spark)
 * [Hivemall on Apache Pig](https://github.com/daijyc/hivemall/wiki/PigHome)

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/docs/gitbook/clustering/lda.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/clustering/lda.md b/docs/gitbook/clustering/lda.md
new file mode 100644
index 0000000..1998934
--- /dev/null
+++ b/docs/gitbook/clustering/lda.md
@@ -0,0 +1,170 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+Topic modeling is a way to analyze massive documents by clustering them into 
some ***topics***. In particular, **Latent Dirichlet Allocation** (LDA) is one 
of the most popular topic modeling techniques; the papers that introduce the 
method are as follows:
+
+- D. M. Blei, et al. [Latent Dirichlet 
Allocation](http://www.jmlr.org/papers/v3/blei03a.html). Journal of Machine 
Learning Research 3, pp. 993-1022, 2003.
+- M. D. Hoffman, et al. [Online Learning for Latent Dirichlet 
Allocation](https://papers.nips.cc/paper/3902-online-learning-for-latent-dirichlet-allocation).
 NIPS 2010.
+
+Hivemall enables you to analyze your documents based on LDA. This page 
explains how to use the feature.
+
+<!-- toc -->
+
+*Note: This feature is supported from Hivemall v0.5-rc.1 or later.*
+
+# Prepare document data
+
+Assume that we already have a table `docs` which contains many documents in 
string format:
+
+| docid | doc  |
+|:---:|:---|
+| 1  | "Fruits and vegetables are healthy." |
+|2 | "I like apples, oranges, and avocados. I do not like the flu or colds." |
+| ... | ... |
+
+Hivemall has several functions which are particularly useful for text 
processing. More specifically, by using `tokenize()` and `is_stopword()`, you 
can immediately convert the documents to 
[bag-of-words](https://en.wikipedia.org/wiki/Bag-of-words_model)-like format:
+
+```sql
+select
+  docid,
+  feature(word, count(word)) as word_count
+from docs t1 LATERAL VIEW explode(tokenize(doc, true)) t2 as word
+where
+  not is_stopword(word)
+group by
+  docid, word
+;
+```
+
+| docid | word_count |
+|:---:|:---|
+|1  |     fruits:1 |
+|1  |     healthy:1|
+|1  |     vegetables:1 |
+|2  |     apples:1 |
+|2  |     avocados:1 |
+|2  |     colds:1 |
+|2   |    flu:1 |
+|2 |      like:2 |
+|2|       oranges:1 |
+
+# Building Topic Models and Finding Topic Words
+
+For each document, collecting `word_count`s in the last table creates a 
feature vector as an input to the `train_lda()` function:
+
+```sql
+with word_counts as (
+  select
+    docid,
+    feature(word, count(word)) as word_count
+  from docs t1 LATERAL VIEW explode(tokenize(doc, true)) t2 as word
+  where
+    not is_stopword(word)
+  group by
+    docid, word
+)
+select
+  train_lda(feature, "-topic 2 -iter 20") as (label, word, lambda)
+from (
+  select docid, collect_set(word_count) as feature
+  from word_counts
+  group by docid
+) t
+;
+```
+
+Here, an option `-topic 2` specifies the number of topics we assume in the set 
of documents.
+
+Eventually, storing the query result as a new table `lda_model` gives the following:
+
+|label | word   | lambda |
+|:---:|:---:|:---:|
+|0     | fruits | 0.33372128|
+|0     | vegetables  |    0.33272517|
+|0     | healthy | 0.33246377|
+|0     | flu   |  2.3617347E-4|
+|0     | apples | 2.1898883E-4|
+|0     | oranges | 1.8161473E-4|
+|0     | like   | 1.7666373E-4|
+|0     | avocados  |      1.726186E-4|
+|0     | colds  | 1.037139E-4|
+|1     | colds  | 0.16622013|
+|1     | avocados |       0.16618845|
+|1     | oranges | 0.1661859|
+|1     | like  |  0.16618414|
+|1     | apples |  0.16616651|
+|1     | flu   |  0.16615893|
+|1     | healthy | 0.0012059759|
+|1     | vegetables  |    0.0010818697|
+|1     | fruits  | 6.080827E-4|
+
+In the table, `label` indicates a topic index, and `lambda` is a value which 
represents how likely each word is to characterize a topic. That is, the top-N 
words of a topic, ranked by `lambda`, are the ***topic words*** of that topic.
+
+Obviously, we can observe that topic `0` corresponds to document `1`, and 
topic `1` represents words in document `2`.
+
+# Predicting Topic Assignments of Documents
+
+Once you have constructed topic models as described before, a function 
`lda_predict()` allows you to predict topic assignments of documents.
+
+For example, if we consider the `docs` table, exactly the same set of 
documents as used for training, the probability that a document is assigned to 
a topic can be computed by:
+
+```sql
+with test as (
+  select
+    docid,
+    word,
+    count(word) as value
+  from docs t1 LATERAL VIEW explode(tokenize(doc, true)) t2 as word
+  where
+    not is_stopword(word)
+  group by
+    docid, word
+)
+select
+  t.docid,
+  lda_predict(t.word, t.value, m.label, m.lambda, "-topic 2") as probabilities
+from
+  test t
+  JOIN lda_model m ON (t.word = m.word)
+group by
+  t.docid
+;
+```
+
+| docid | probabilities (sorted by probabilities) | 
+|:---:|:---|
+|1  | [{"label":0,"probability":0.875},{"label":1,"probability":0.125}]|
+|2  | [{"label":1,"probability":0.9375},{"label":0,"probability":0.0625}]|
+
+Importantly, an option `-topic` should be set to the same value as you set for 
training.
+
+Since the probabilities are sorted in descending order, the label of the most 
promising topic is easily obtained as follows (assuming the result above is stored in a table `topic`):
+
+```sql
+select docid, probabilities[0].label
+from topic
+;
+```
+
+| docid | label |
+|:---:|:---:|
+|  1 | 0 |
+| 2 | 1 |
+
+Of course, using a different set of documents for prediction is possible. 
Predicting topic assignments of newly observed documents should be a more 
realistic scenario.

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/docs/gitbook/getting_started/installation.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/getting_started/installation.md 
b/docs/gitbook/getting_started/installation.md
index 3a3c97f..896d247 100644
--- a/docs/gitbook/getting_started/installation.md
+++ b/docs/gitbook/getting_started/installation.md
@@ -20,7 +20,8 @@
 Prerequisites
 ============
 
-* Hive v0.12 or later
+* Hadoop v2.4.0 or later
+* Hive v0.13 or later
 * Java 7 or later
 * 
[hivemall-core-xxx-with-dependencies.jar](https://github.com/myui/hivemall/releases)
 * [define-all.hive](https://github.com/myui/hivemall/releases)
@@ -41,4 +42,15 @@ This automatically loads all Hivemall functions every time 
you start a Hive sess
 $ hive
 add jar /tmp/hivemall-core-xxx-with-dependencies.jar;
 source /tmp/define-all.hive;
-```
\ No newline at end of file
+```
+
+Build from Source
+==================
+
+```sh
+$ git clone https://github.com/apache/incubator-hivemall.git
+$ cd incubator-hivemall
+$ bin/build.sh
+```
+
+Then, you can find hivemall jars in `./target`.

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/mixserv/pom.xml
----------------------------------------------------------------------
diff --git a/mixserv/pom.xml b/mixserv/pom.xml
index 41ba401..0e0e83c 100644
--- a/mixserv/pom.xml
+++ b/mixserv/pom.xml
@@ -39,8 +39,14 @@
                <!-- provided scope -->
                <dependency>
                        <groupId>org.apache.hadoop</groupId>
-                       <artifactId>hadoop-core</artifactId>
-                       <version>0.20.2-cdh3u6</version>
+                       <artifactId>hadoop-common</artifactId>
+                       <version>${hadoop.version}</version>
+                       <scope>provided</scope>
+               </dependency>
+               <dependency>
+                       <groupId>org.apache.hadoop</groupId>
+                       <artifactId>hadoop-mapreduce-client-core</artifactId>
+                       <version>${hadoop.version}</version>
                        <scope>provided</scope>
                </dependency>
                <dependency>
@@ -74,9 +80,9 @@
                        <scope>provided</scope>
                </dependency>
                <dependency>
-                       <groupId>org.apache.hadoop.thirdparty.guava</groupId>
+                       <groupId>com.google.guava</groupId>
                        <artifactId>guava</artifactId>
-                       <version>r09-jarjar</version>
+                       <version>${guava.version}</version>
                        <scope>provided</scope>
                </dependency>
 
@@ -124,7 +130,7 @@
                <dependency>
                        <groupId>junit</groupId>
                        <artifactId>junit</artifactId>
-                       <version>4.12</version>
+                       <version>${junit.version}</version>
                        <scope>test</scope>
                </dependency>
                <dependency>

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/nlp/pom.xml
----------------------------------------------------------------------
diff --git a/nlp/pom.xml b/nlp/pom.xml
index c7d2cef..b6ea409 100644
--- a/nlp/pom.xml
+++ b/nlp/pom.xml
@@ -39,8 +39,14 @@
                <!-- provided scope -->
                <dependency>
                        <groupId>org.apache.hadoop</groupId>
-                       <artifactId>hadoop-core</artifactId>
-                       <version>0.20.2-cdh3u6</version>
+                       <artifactId>hadoop-common</artifactId>
+                       <version>${hadoop.version}</version>
+                       <scope>provided</scope>
+               </dependency>
+               <dependency>
+                       <groupId>org.apache.hadoop</groupId>
+                       <artifactId>hadoop-mapreduce-client-core</artifactId>
+                       <version>${hadoop.version}</version>
                        <scope>provided</scope>
                </dependency>
                <dependency>
@@ -92,9 +98,9 @@
                        <scope>provided</scope>
                </dependency>
                <dependency>
-                       <groupId>org.apache.hadoop.thirdparty.guava</groupId>
+                       <groupId>com.google.guava</groupId>
                        <artifactId>guava</artifactId>
-                       <version>r09-jarjar</version>
+                       <version>${guava.version}</version>
                        <scope>provided</scope>
                </dependency>
                <dependency>
@@ -116,7 +122,7 @@
                <dependency>
                        <groupId>junit</groupId>
                        <artifactId>junit</artifactId>
-                       <version>4.12</version>
+                       <version>${junit.version}</version>
                        <scope>test</scope>
                </dependency>
                <dependency>

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 7743d5a..63abe87 100644
--- a/pom.xml
+++ b/pom.xml
@@ -25,7 +25,7 @@
        <version>0.4.2-rc.2</version>
 
        <name>Apache Hivemall</name>
-       <description>Scalable Machine Learning Library for Apache 
Hive</description>
+       <description>Scalable Machine Learning Library for Apache Hive, Apache 
Spark, and Apache Pig</description>
        <url>http://hivemall.incubator.apache.org/</url>
        <inceptionYear>2013</inceptionYear>
        <organization>
@@ -158,8 +158,8 @@
                        <name>Takuya Kitazawa</name>
                        <email>takuti[at]apache.org</email>
                        <url>https://github.com/takuti</url>
-                       <organization>The University of Tokyo</organization>
-                       
<organizationUrl>http://www.u-tokyo.ac.jp/</organizationUrl>
+                       <organization>Treasure Data, Inc.</organization>
+                       
<organizationUrl>https://www.treasuredata.com/</organizationUrl>
                        <roles>
                                <role>Committer</role>
                        </roles>
@@ -241,25 +241,23 @@
        </modules>
 
        <properties>
+               <java.source.version>1.7</java.source.version>
+               <java.target.version>1.7</java.target.version>
+               <scala.version>2.11.8</scala.version>
+               <scala.binary.version>2.11</scala.binary.version>
                
<maven.build.timestamp.format>yyyy</maven.build.timestamp.format>
                <build.year>${maven.build.timestamp}</build.year>
                
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
                <protobuf.version>2.5.0</protobuf.version>
                <protoc.path>${env.PROTOC_PATH}</protoc.path>
+               <hadoop.version>2.4.0</hadoop.version>
                <hive.version>0.13.0</hive.version>
-               <scala.version>2.11.8</scala.version>
+               <guava.version>11.0.2</guava.version>
+               <junit.version>4.12</junit.version>
                
<dependency.locations.enabled>false</dependency.locations.enabled>
-               <scala.binary.version>2.11</scala.binary.version>
                <main.basedir>${project.basedir}</main.basedir>
        </properties>
 
-       <repositories>
-               <repository>
-                       <id>cloudera</id>
-                       
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
-               </repository>
-       </repositories>
-
        <distributionManagement>
                <snapshotRepository>
                        <id>ossrh</id>
@@ -275,7 +273,6 @@
                                <module>spark/spark-common</module>
                        </modules>
                        <properties>
-                               <hadoop.version>2.7</hadoop.version>
                                <spark.version>2.1.0</spark.version>
                                <spark.binary.version>2.1</spark.binary.version>
                        </properties>
@@ -287,7 +284,6 @@
                                <module>spark/spark-common</module>
                        </modules>
                        <properties>
-                               <hadoop.version>2.7</hadoop.version>
                                <spark.version>2.0.2</spark.version>
                                <spark.binary.version>2.0</spark.binary.version>
                        </properties>
@@ -480,8 +476,8 @@
                                <artifactId>maven-compiler-plugin</artifactId>
                                <version>3.1</version>
                                <configuration>
-                                       <source>1.7</source>
-                                       <target>1.7</target>
+                                       <source>${java.source.version}</source>
+                                       <target>${java.target.version}</target>
                                        <debug>true</debug>
                                        
<debuglevel>lines,vars,source</debuglevel>
                                        <encoding>UTF-8</encoding>

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/resources/ddl/define-all-as-permanent.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all-as-permanent.hive 
b/resources/ddl/define-all-as-permanent.hive
index c6dda03..1eb9c82 100644
--- a/resources/ddl/define-all-as-permanent.hive
+++ b/resources/ddl/define-all-as-permanent.hive
@@ -616,6 +616,16 @@ CREATE FUNCTION changefinder as 
'hivemall.anomaly.ChangeFinderUDF' USING JAR '${
 DROP FUNCTION IF EXISTS sst;
 CREATE FUNCTION sst as 'hivemall.anomaly.SingularSpectrumTransformUDF' USING 
JAR '${hivemall_jar}';
 
+--------------------
+-- Topic Modeling --
+--------------------
+
+DROP FUNCTION IF EXISTS train_lda;
+CREATE FUNCTION train_lda as 'hivemall.topicmodel.LDAUDTF' USING JAR 
'${hivemall_jar}';
+
+DROP FUNCTION IF EXISTS lda_predict;
+CREATE FUNCTION lda_predict as 'hivemall.topicmodel.LDAPredictUDAF' USING JAR 
'${hivemall_jar}';
+
 ----------------------------
 -- Smile related features --
 ----------------------------

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/resources/ddl/define-all.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive
index 8ea16c1..b503546 100644
--- a/resources/ddl/define-all.hive
+++ b/resources/ddl/define-all.hive
@@ -612,6 +612,16 @@ create temporary function changefinder as 
'hivemall.anomaly.ChangeFinderUDF';
 drop temporary function if exists sst;
 create temporary function sst as 
'hivemall.anomaly.SingularSpectrumTransformUDF';
 
+--------------------
+-- Topic Modeling --
+--------------------
+
+drop temporary function if exists train_lda;
+create temporary function train_lda as 'hivemall.topicmodel.LDAUDTF';
+
+drop temporary function if exists lda_predict;
+create temporary function lda_predict as 'hivemall.topicmodel.LDAPredictUDAF';
+
 ----------------------------
 -- Smile related features --
 ----------------------------

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/resources/ddl/define-all.spark
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark
index 0172cc8..b5239cf 100644
--- a/resources/ddl/define-all.spark
+++ b/resources/ddl/define-all.spark
@@ -597,6 +597,16 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS sst")
 sqlContext.sql("CREATE TEMPORARY FUNCTION sst AS 
'hivemall.anomaly.SingularSpectrumTransformUDF'")
 
 /**
+ * Topic Modeling
+ */
+
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS train_lda")
+sqlContext.sql("CREATE TEMPORARY FUNCTION train_lda AS 
'hivemall.topicmodel.LDAUDTF'")
+
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS lda_predict")
+sqlContext.sql("CREATE TEMPORARY FUNCTION lda_predict AS 
'hivemall.topicmodel.LDAPredictUDAF'")
+
+/**
  * Smile related features
  */
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/resources/ddl/define-udfs.td.hql
----------------------------------------------------------------------
diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql
index cff0913..28d17ff 100644
--- a/resources/ddl/define-udfs.td.hql
+++ b/resources/ddl/define-udfs.td.hql
@@ -158,6 +158,8 @@ create temporary function guess_attribute_types as 
'hivemall.smile.tools.GuessAt
 -- since Hivemall v0.5-rc.1
 create temporary function changefinder as 'hivemall.anomaly.ChangeFinderUDF';
 create temporary function sst as 
'hivemall.anomaly.SingularSpectrumTransformUDF';
+create temporary function train_lda as 'hivemall.topicmodel.LDAUDTF';
+create temporary function lda_predict as 'hivemall.topicmodel.LDAPredictUDAF';
 
 -- NLP features
 create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/spark/spark-2.0/pom.xml
----------------------------------------------------------------------
diff --git a/spark/spark-2.0/pom.xml b/spark/spark-2.0/pom.xml
index f0937d7..123c424 100644
--- a/spark/spark-2.0/pom.xml
+++ b/spark/spark-2.0/pom.xml
@@ -234,7 +234,7 @@
                                        <junitxml>.</junitxml>
                                        
<filereports>SparkTestSuite.txt</filereports>
                                        <argLine>-ea -Xmx2g 
-XX:MaxPermSize=${MaxPermGen} 
-XX:ReservedCodeCacheSize=${CodeCacheSize}</argLine>
-                                       <stderr/>
+                                       <stderr />
                                        <environmentVariables>
                                                
<SPARK_PREPEND_CLASSES>1</SPARK_PREPEND_CLASSES>
                                                
<SPARK_SCALA_VERSION>${scala.binary.version}</SPARK_SCALA_VERSION>

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/spark/spark-2.1/pom.xml
----------------------------------------------------------------------
diff --git a/spark/spark-2.1/pom.xml b/spark/spark-2.1/pom.xml
index a0f380f..22d3e12 100644
--- a/spark/spark-2.1/pom.xml
+++ b/spark/spark-2.1/pom.xml
@@ -234,7 +234,7 @@
                                        <junitxml>.</junitxml>
                                        
<filereports>SparkTestSuite.txt</filereports>
                                        <argLine>-ea -Xmx2g 
-XX:MaxPermSize=${MaxPermGen} 
-XX:ReservedCodeCacheSize=${CodeCacheSize}</argLine>
-                                       <stderr/>
+                                       <stderr />
                                        <environmentVariables>
                                                
<SPARK_PREPEND_CLASSES>1</SPARK_PREPEND_CLASSES>
                                                
<SPARK_SCALA_VERSION>${scala.binary.version}</SPARK_SCALA_VERSION>

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/spark/spark-common/pom.xml
----------------------------------------------------------------------
diff --git a/spark/spark-common/pom.xml b/spark/spark-common/pom.xml
index 7d76ea4..e8e8ff4 100644
--- a/spark/spark-common/pom.xml
+++ b/spark/spark-common/pom.xml
@@ -69,8 +69,14 @@
 
                <dependency>
                        <groupId>org.apache.hadoop</groupId>
-                       <artifactId>hadoop-core</artifactId>
-                       <version>0.20.2-cdh3u6</version>
+                       <artifactId>hadoop-common</artifactId>
+                       <version>${hadoop.version}</version>
+                       <scope>provided</scope>
+               </dependency>
+               <dependency>
+                       <groupId>org.apache.hadoop</groupId>
+                       <artifactId>hadoop-mapreduce-client-core</artifactId>
+                       <version>${hadoop.version}</version>
                        <scope>provided</scope>
                </dependency>
                <dependency>

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/xgboost/pom.xml
----------------------------------------------------------------------
diff --git a/xgboost/pom.xml b/xgboost/pom.xml
index cd5dc0d..853edb9 100644
--- a/xgboost/pom.xml
+++ b/xgboost/pom.xml
@@ -41,8 +41,14 @@
                <!-- provided scope -->
                <dependency>
                        <groupId>org.apache.hadoop</groupId>
-                       <artifactId>hadoop-core</artifactId>
-                       <version>0.20.2-cdh3u6</version>
+                       <artifactId>hadoop-common</artifactId>
+                       <version>${hadoop.version}</version>
+                       <scope>provided</scope>
+               </dependency>
+               <dependency>
+                       <groupId>org.apache.hadoop</groupId>
+                       <artifactId>hadoop-mapreduce-client-core</artifactId>
+                       <version>${hadoop.version}</version>
                        <scope>provided</scope>
                </dependency>
                <dependency>

[1/2] incubator-hivemall git commit: Close #72: [HIVEMALL-86] Updated Hadoop version dependencies from cdh3 to v2.4.0

Reply via email to