Repository: incubator-hivemall Updated Branches: refs/heads/master 8aae974fc -> cb16a3944
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/core/src/test/java/hivemall/topicmodel/OnlineLDAModelTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/topicmodel/OnlineLDAModelTest.java b/core/src/test/java/hivemall/topicmodel/OnlineLDAModelTest.java new file mode 100644 index 0000000..e151943 --- /dev/null +++ b/core/src/test/java/hivemall/topicmodel/OnlineLDAModelTest.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.topicmodel; + +import java.util.Map; +import java.util.List; +import java.util.SortedMap; + +import org.junit.Assert; +import org.junit.Test; + +public class OnlineLDAModelTest { + private static final boolean DEBUG = false; + + @Test + public void test() { + int K = 2; + int it = 0; + float perplexityPrev; + float perplexity = Float.MAX_VALUE; + + OnlineLDAModel model = new OnlineLDAModel(K, 1.f / K, 1.f / K, 2, 80, 0.8, 1E-5d); + + String[] doc1 = new String[] {"fruits:1", "healthy:1", "vegetables:1"}; + String[] doc2 = new String[] {"apples:1", "avocados:1", "colds:1", "flu:1", "like:2", "oranges:1"}; + + do { + perplexityPrev = perplexity; + perplexity = 0.f; + + // online (i.e., one-by-one) updating + model.train(new String[][] {doc1}); + perplexity += model.computePerplexity(); + + model.train(new String[][] {doc2}); + perplexity += model.computePerplexity(); + + perplexity /= 2.f; // mean perplexity for the 2 docs + + it++; + println("Iteration " + it + ": mean perplexity = " + perplexity); + } while(Math.abs(perplexityPrev - perplexity) >= 1E-6f); + + SortedMap<Float, List<String>> topicWords; + + println("Topic 0:"); + println("========"); + topicWords = model.getTopicWords(0); + for (Map.Entry<Float, List<String>> e : topicWords.entrySet()) { + List<String> words = e.getValue(); + for (int i = 0; i < words.size(); i++) { + println(e.getKey() + " " + words.get(i)); + } + } + println("========"); + + println("Topic 1:"); + println("========"); + topicWords = model.getTopicWords(1); + for (Map.Entry<Float, List<String>> e : topicWords.entrySet()) { + List<String> words = e.getValue(); + for (int i = 0; i < words.size(); i++) { + println(e.getKey() + " " + words.get(i)); + } + } + println("========"); + + int k1, k2; + float[] topicDistr = model.getTopicDistribution(doc1); + if (topicDistr[0] > topicDistr[1]) { + // topic 0 MUST represent doc#1 + k1 = 0; + k2 = 1; + } else { + k1 = 1; + k2 = 0; + } + Assert.assertTrue("doc1 is in topic 
" + k1 + " (" + (topicDistr[k1] * 100) + "%), " + + "and `vegetables` SHOULD be more suitable topic word than `flu` in the topic", + model.getLambda("vegetables", k1) > model.getLambda("flu", k1)); + Assert.assertTrue("doc2 is in topic " + k2 + " (" + (topicDistr[k2] * 100) + "%), " + + "and `avocados` SHOULD be more suitable topic word than `healthy` in the topic", + model.getLambda("avocados", k2) > model.getLambda("healthy", k2)); + } + + private static void println(String msg) { + if (DEBUG) { + System.out.println(msg); + } + } + +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/docs/gitbook/SUMMARY.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md index 4c6ed1b..78b1faa 100644 --- a/docs/gitbook/SUMMARY.md +++ b/docs/gitbook/SUMMARY.md @@ -150,7 +150,11 @@ * [Change-Point Detection using Singular Spectrum Transformation (SST)](anomaly/sst.md) * [ChangeFinder: Detecting Outlier and Change-Point Simultaneously](anomaly/changefinder.md) -## Part X - Hivemall on Spark +## Part X - Clustering + +* [Latent Dirichlet Allocation](clustering/lda.md) + +## Part XI - Hivemall on Spark * [Getting Started](spark/getting_started/README.md) * [Installation](spark/getting_started/installation.md) @@ -165,7 +169,7 @@ * [Top-k Join processing](spark/misc/topk_join.md) * [Other utility functions](spark/misc/functions.md) -## Part X - External References +## Part XII - External References * [Hivemall on Apache Spark](https://github.com/maropu/hivemall-spark) * [Hivemall on Apache Pig](https://github.com/daijyc/hivemall/wiki/PigHome) http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/docs/gitbook/clustering/lda.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/clustering/lda.md b/docs/gitbook/clustering/lda.md new file mode 100644 index 0000000..1998934 --- /dev/null +++ 
b/docs/gitbook/clustering/lda.md @@ -0,0 +1,170 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +Topic modeling is a way to analyze massive documents by clustering them into some ***topics***. In particular, **Latent Dirichlet Allocation** (LDA) is one of the most popular topic modeling techniques; the papers introducing the method are as follows: + +- D. M. Blei, et al. [Latent Dirichlet Allocation](http://www.jmlr.org/papers/v3/blei03a.html). Journal of Machine Learning Research 3, pp. 993-1022, 2003. +- M. D. Hoffman, et al. [Online Learning for Latent Dirichlet Allocation](https://papers.nips.cc/paper/3902-online-learning-for-latent-dirichlet-allocation). NIPS 2010. + +Hivemall enables you to analyze your documents based on LDA. This page gives usage instructions for the feature. + +<!-- toc --> + +*Note: This feature is supported in Hivemall v0.5-rc.1 or later.* + +# Prepare document data + +Assume that we already have a table `docs` which contains many documents in string format: + +| docid | doc | +|:---:|:---| +| 1 | "Fruits and vegetables are healthy." | +|2 | "I like apples, oranges, and avocados. I do not like the flu or colds." | +| ... | ... 
| + +Hivemall has several functions which are particularly useful for text processing. More specifically, by using `tokenize()` and `is_stopword()`, you can immediately convert the documents to [bag-of-words](https://en.wikipedia.org/wiki/Bag-of-words_model)-like format: + +```sql +select + docid, + feature(word, count(word)) as word_count +from docs t1 LATERAL VIEW explode(tokenize(doc, true)) t2 as word +where + not is_stopword(word) +group by + docid, word +; +``` + +| docid | word_count | +|:---:|:---| +|1 | fruits:1 | +|1 | healthy:1| +|1 | vegetables:1 | +|2 | apples:1 | +|2 | avocados:1 | +|2 | colds:1 | +|2 | flu:1 | +|2 | like:2 | +|2| oranges:1 | + +# Building Topic Models and Finding Topic Words + +For each document, collecting `word_count`s in the last table creates a feature vector as an input to the `train_lda()` function: + +```sql +with word_counts as ( + select + docid, + feature(word, count(word)) as word_count + from docs t1 LATERAL VIEW explode(tokenize(doc, true)) t2 as word + where + not is_stopword(word) + group by + docid, word +) +select + train_lda(feature, "-topic 2 -iter 20") as (label, word, lambda) +from ( + select docid, collect_set(word_count) as feature + from word_counts + group by docid +) t +; +``` + +Here, an option `-topic 2` specifies the number of topics we assume in the set of documents. 
+ +Eventually, a new table `lda_model` is generated as shown below: + +|label | word | lambda | +|:---:|:---:|:---:| +|0 | fruits | 0.33372128| +|0 | vegetables | 0.33272517| +|0 | healthy | 0.33246377| +|0 | flu | 2.3617347E-4| +|0 | apples | 2.1898883E-4| +|0 | oranges | 1.8161473E-4| +|0 | like | 1.7666373E-4| +|0 | avocados | 1.726186E-4| +|0 | colds | 1.037139E-4| +|1 | colds | 0.16622013| +|1 | avocados | 0.16618845| +|1 | oranges | 0.1661859| +|1 | like | 0.16618414| +|1 | apples | 0.16616651| +|1 | flu | 0.16615893| +|1 | healthy | 0.0012059759| +|1 | vegetables | 0.0010818697| +|1 | fruits | 6.080827E-4| + +In the table, `label` indicates a topic index, and `lambda` is a value which represents how likely each word is to characterize a topic. That is, we can say that, in terms of `lambda`, the top-N words are the ***topic words*** of a topic. + +Obviously, we can observe that topic `0` corresponds to document `1`, and topic `1` represents words in document `2`. + +# Predicting Topic Assignments of Documents + +Once you have constructed topic models as described before, the `lda_predict()` function allows you to predict topic assignments of documents. 
+ +For example, if we consider the `docs` table, exactly the same set of documents as used for training, the probability that a document is assigned to a topic can be computed by: + +```sql +with test as ( + select + docid, + word, + count(word) as value + from docs t1 LATERAL VIEW explode(tokenize(doc, true)) t2 as word + where + not is_stopword(word) + group by + docid, word +) +select + t.docid, + lda_predict(t.word, t.value, m.label, m.lambda, "-topic 2") as probabilities +from + test t + JOIN lda_model m ON (t.word = m.word) +group by + t.docid +; +``` + +| docid | probabilities (sorted by probabilities) | +|:---:|:---| +|1 | [{"label":0,"probability":0.875},{"label":1,"probability":0.125}]| +|2 | [{"label":1,"probability":0.9375},{"label":0,"probability":0.0625}]| + +Importantly, the `-topic` option must be set to the same value as you used for training. + +Since the probabilities are sorted in descending order, the label of the most promising topic is easily obtained as: + +```sql +select docid, probabilities[0].label +from topic +; +``` + +| docid | label | +|:---:|:---:| +| 1 | 0 | +| 2 | 1 | + +Of course, using a different set of documents for prediction is possible. Predicting topic assignments of newly observed documents is a more realistic scenario. 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/docs/gitbook/getting_started/installation.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/getting_started/installation.md b/docs/gitbook/getting_started/installation.md index 3a3c97f..896d247 100644 --- a/docs/gitbook/getting_started/installation.md +++ b/docs/gitbook/getting_started/installation.md @@ -20,7 +20,8 @@ Prerequisites ============ -* Hive v0.12 or later +* Hadoop v2.4.0 or later +* Hive v0.13 or later * Java 7 or later * [hivemall-core-xxx-with-dependencies.jar](https://github.com/myui/hivemall/releases) * [define-all.hive](https://github.com/myui/hivemall/releases) @@ -41,4 +42,15 @@ This automatically loads all Hivemall functions every time you start a Hive sess $ hive add jar /tmp/hivemall-core-xxx-with-dependencies.jar; source /tmp/define-all.hive; -``` \ No newline at end of file +``` + +Build from Source +================== + +```sh +$ git clone https://github.com/apache/incubator-hivemall.git +$ cd incubator-hivemall +$ bin/build.sh +``` + +Then, you can find hivemall jars in `./target`. 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/mixserv/pom.xml ---------------------------------------------------------------------- diff --git a/mixserv/pom.xml b/mixserv/pom.xml index 41ba401..0e0e83c 100644 --- a/mixserv/pom.xml +++ b/mixserv/pom.xml @@ -39,8 +39,14 @@ <!-- provided scope --> <dependency> <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-core</artifactId> - <version>0.20.2-cdh3u6</version> + <artifactId>hadoop-common</artifactId> + <version>${hadoop.version}</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-mapreduce-client-core</artifactId> + <version>${hadoop.version}</version> <scope>provided</scope> </dependency> <dependency> @@ -74,9 +80,9 @@ <scope>provided</scope> </dependency> <dependency> - <groupId>org.apache.hadoop.thirdparty.guava</groupId> + <groupId>com.google.guava</groupId> <artifactId>guava</artifactId> - <version>r09-jarjar</version> + <version>${guava.version}</version> <scope>provided</scope> </dependency> @@ -124,7 +130,7 @@ <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> - <version>4.12</version> + <version>${junit.version}</version> <scope>test</scope> </dependency> <dependency> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/nlp/pom.xml ---------------------------------------------------------------------- diff --git a/nlp/pom.xml b/nlp/pom.xml index c7d2cef..b6ea409 100644 --- a/nlp/pom.xml +++ b/nlp/pom.xml @@ -39,8 +39,14 @@ <!-- provided scope --> <dependency> <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-core</artifactId> - <version>0.20.2-cdh3u6</version> + <artifactId>hadoop-common</artifactId> + <version>${hadoop.version}</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-mapreduce-client-core</artifactId> + <version>${hadoop.version}</version> <scope>provided</scope> 
</dependency> <dependency> @@ -92,9 +98,9 @@ <scope>provided</scope> </dependency> <dependency> - <groupId>org.apache.hadoop.thirdparty.guava</groupId> + <groupId>com.google.guava</groupId> <artifactId>guava</artifactId> - <version>r09-jarjar</version> + <version>${guava.version}</version> <scope>provided</scope> </dependency> <dependency> @@ -116,7 +122,7 @@ <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> - <version>4.12</version> + <version>${junit.version}</version> <scope>test</scope> </dependency> <dependency> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 7743d5a..63abe87 100644 --- a/pom.xml +++ b/pom.xml @@ -25,7 +25,7 @@ <version>0.4.2-rc.2</version> <name>Apache Hivemall</name> - <description>Scalable Machine Learning Library for Apache Hive</description> + <description>Scalable Machine Learning Library for Apache Hive, Apache Spark, and Apache Pig</description> <url>http://hivemall.incubator.apache.org/</url> <inceptionYear>2013</inceptionYear> <organization> @@ -158,8 +158,8 @@ <name>Takuya Kitazawa</name> <email>takuti[at]apache.org</email> <url>https://github.com/takuti</url> - <organization>The University of Tokyo</organization> - <organizationUrl>http://www.u-tokyo.ac.jp/</organizationUrl> + <organization>Treasure Data, Inc.</organization> + <organizationUrl>https://www.treasuredata.com/</organizationUrl> <roles> <role>Committer</role> </roles> @@ -241,25 +241,23 @@ </modules> <properties> + <java.source.version>1.7</java.source.version> + <java.target.version>1.7</java.target.version> + <scala.version>2.11.8</scala.version> + <scala.binary.version>2.11</scala.binary.version> <maven.build.timestamp.format>yyyy</maven.build.timestamp.format> <build.year>${maven.build.timestamp}</build.year> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> 
<protobuf.version>2.5.0</protobuf.version> <protoc.path>${env.PROTOC_PATH}</protoc.path> + <hadoop.version>2.4.0</hadoop.version> <hive.version>0.13.0</hive.version> - <scala.version>2.11.8</scala.version> + <guava.version>11.0.2</guava.version> + <junit.version>4.12</junit.version> <dependency.locations.enabled>false</dependency.locations.enabled> - <scala.binary.version>2.11</scala.binary.version> <main.basedir>${project.basedir}</main.basedir> </properties> - <repositories> - <repository> - <id>cloudera</id> - <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url> - </repository> - </repositories> - <distributionManagement> <snapshotRepository> <id>ossrh</id> @@ -275,7 +273,6 @@ <module>spark/spark-common</module> </modules> <properties> - <hadoop.version>2.7</hadoop.version> <spark.version>2.1.0</spark.version> <spark.binary.version>2.1</spark.binary.version> </properties> @@ -287,7 +284,6 @@ <module>spark/spark-common</module> </modules> <properties> - <hadoop.version>2.7</hadoop.version> <spark.version>2.0.2</spark.version> <spark.binary.version>2.0</spark.binary.version> </properties> @@ -480,8 +476,8 @@ <artifactId>maven-compiler-plugin</artifactId> <version>3.1</version> <configuration> - <source>1.7</source> - <target>1.7</target> + <source>${java.source.version}</source> + <target>${java.target.version}</target> <debug>true</debug> <debuglevel>lines,vars,source</debuglevel> <encoding>UTF-8</encoding> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/resources/ddl/define-all-as-permanent.hive ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index c6dda03..1eb9c82 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -616,6 +616,16 @@ CREATE FUNCTION changefinder as 'hivemall.anomaly.ChangeFinderUDF' USING JAR '${ DROP FUNCTION IF EXISTS sst; 
CREATE FUNCTION sst as 'hivemall.anomaly.SingularSpectrumTransformUDF' USING JAR '${hivemall_jar}'; +-------------------- +-- Topic Modeling -- +-------------------- + +DROP FUNCTION IF EXISTS train_lda; +CREATE FUNCTION train_lda as 'hivemall.topicmodel.LDAUDTF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS lda_predict; +CREATE FUNCTION lda_predict as 'hivemall.topicmodel.LDAPredictUDAF' USING JAR '${hivemall_jar}'; + ---------------------------- -- Smile related features -- ---------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/resources/ddl/define-all.hive ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 8ea16c1..b503546 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -612,6 +612,16 @@ create temporary function changefinder as 'hivemall.anomaly.ChangeFinderUDF'; drop temporary function if exists sst; create temporary function sst as 'hivemall.anomaly.SingularSpectrumTransformUDF'; +-------------------- +-- Topic Modeling -- +-------------------- + +drop temporary function if exists train_lda; +create temporary function train_lda as 'hivemall.topicmodel.LDAUDTF'; + +drop temporary function if exists lda_predict; +create temporary function lda_predict as 'hivemall.topicmodel.LDAPredictUDAF'; + ---------------------------- -- Smile related features -- ---------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/resources/ddl/define-all.spark ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index 0172cc8..b5239cf 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -597,6 +597,16 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS sst") sqlContext.sql("CREATE TEMPORARY FUNCTION sst AS 
'hivemall.anomaly.SingularSpectrumTransformUDF'") /** + * Topic Modeling + */ + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS train_lda") +sqlContext.sql("CREATE TEMPORARY FUNCTION train_lda AS 'hivemall.topicmodel.LDAUDTF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS lda_predict") +sqlContext.sql("CREATE TEMPORARY FUNCTION lda_predict AS 'hivemall.topicmodel.LDAPredictUDAF'") + +/** * Smile related features */ http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/resources/ddl/define-udfs.td.hql ---------------------------------------------------------------------- diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index cff0913..28d17ff 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -158,6 +158,8 @@ create temporary function guess_attribute_types as 'hivemall.smile.tools.GuessAt -- since Hivemall v0.5-rc.1 create temporary function changefinder as 'hivemall.anomaly.ChangeFinderUDF'; create temporary function sst as 'hivemall.anomaly.SingularSpectrumTransformUDF'; +create temporary function train_lda as 'hivemall.topicmodel.LDAUDTF'; +create temporary function lda_predict as 'hivemall.topicmodel.LDAPredictUDAF'; -- NLP features create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF'; http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/spark/spark-2.0/pom.xml ---------------------------------------------------------------------- diff --git a/spark/spark-2.0/pom.xml b/spark/spark-2.0/pom.xml index f0937d7..123c424 100644 --- a/spark/spark-2.0/pom.xml +++ b/spark/spark-2.0/pom.xml @@ -234,7 +234,7 @@ <junitxml>.</junitxml> <filereports>SparkTestSuite.txt</filereports> <argLine>-ea -Xmx2g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=${CodeCacheSize}</argLine> - <stderr/> + <stderr /> <environmentVariables> <SPARK_PREPEND_CLASSES>1</SPARK_PREPEND_CLASSES> 
<SPARK_SCALA_VERSION>${scala.binary.version}</SPARK_SCALA_VERSION> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/spark/spark-2.1/pom.xml ---------------------------------------------------------------------- diff --git a/spark/spark-2.1/pom.xml b/spark/spark-2.1/pom.xml index a0f380f..22d3e12 100644 --- a/spark/spark-2.1/pom.xml +++ b/spark/spark-2.1/pom.xml @@ -234,7 +234,7 @@ <junitxml>.</junitxml> <filereports>SparkTestSuite.txt</filereports> <argLine>-ea -Xmx2g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=${CodeCacheSize}</argLine> - <stderr/> + <stderr /> <environmentVariables> <SPARK_PREPEND_CLASSES>1</SPARK_PREPEND_CLASSES> <SPARK_SCALA_VERSION>${scala.binary.version}</SPARK_SCALA_VERSION> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/spark/spark-common/pom.xml ---------------------------------------------------------------------- diff --git a/spark/spark-common/pom.xml b/spark/spark-common/pom.xml index 7d76ea4..e8e8ff4 100644 --- a/spark/spark-common/pom.xml +++ b/spark/spark-common/pom.xml @@ -69,8 +69,14 @@ <dependency> <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-core</artifactId> - <version>0.20.2-cdh3u6</version> + <artifactId>hadoop-common</artifactId> + <version>${hadoop.version}</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-mapreduce-client-core</artifactId> + <version>${hadoop.version}</version> <scope>provided</scope> </dependency> <dependency> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/cb16a394/xgboost/pom.xml ---------------------------------------------------------------------- diff --git a/xgboost/pom.xml b/xgboost/pom.xml index cd5dc0d..853edb9 100644 --- a/xgboost/pom.xml +++ b/xgboost/pom.xml @@ -41,8 +41,14 @@ <!-- provided scope --> <dependency> <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-core</artifactId> - <version>0.20.2-cdh3u6</version> + 
<artifactId>hadoop-common</artifactId> + <version>${hadoop.version}</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-mapreduce-client-core</artifactId> + <version>${hadoop.version}</version> <scope>provided</scope> </dependency> <dependency>
