Repository: incubator-hivemall Updated Branches: refs/heads/master 1e4238757 -> 7bb5d047d
[HIVEMALL-146] Yet another UDF to generate n-grams ## What changes were proposed in this pull request? Add a new UDF `to_ngrams(array<string> words, int minSize, int maxSize)` (registered as `word_ngrams` in the DDL files) which returns a list of n-grams with `minSize <= n <= maxSize` for the given words. This UDF can be used as an alternative to the original Hive `ngrams` function. ## What type of PR is it? Feature ## What is the Jira issue? https://issues.apache.org/jira/browse/HIVEMALL-146 ## How was this patch tested? Unit test, manual tests both on EMR and local Hive ## How to use this feature? as documented ## Checklist (Please remove this section if not needed; check `x` for YES, blank for NO) - [x] Did you apply source code formatter, i.e., `mvn formatter:format`, for your commit? - [x] Did you run system tests on Hive (or Spark)? Author: Takuya Kitazawa <k.tak...@gmail.com> Closes #118 from takuti/HIVEMALL-146-ngrams. Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/7bb5d047 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/7bb5d047 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/7bb5d047 Branch: refs/heads/master Commit: 7bb5d047dcce7e97336d4b73bb3bd078f2a6fc8a Parents: 1e42387 Author: Takuya Kitazawa <k.tak...@gmail.com> Authored: Wed Oct 4 12:06:26 2017 +0900 Committer: Takuya Kitazawa <tak...@apache.org> Committed: Wed Oct 4 12:06:26 2017 +0900 ---------------------------------------------------------------------- .../java/hivemall/tools/text/WordNgramsUDF.java | 90 ++++++++++++++++++++ .../hivemall/tools/text/WordNgramsUDFTest.java | 87 +++++++++++++++++++ docs/gitbook/misc/generic_funcs.md | 8 ++ resources/ddl/define-all-as-permanent.hive | 3 + resources/ddl/define-all.hive | 3 + resources/ddl/define-all.spark | 3 + resources/ddl/define-udfs.td.hql | 1 + 7 files changed, 195 insertions(+) ---------------------------------------------------------------------- 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java b/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java new file mode 100644 index 0000000..e4e5504 --- /dev/null +++ b/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.tools.text; + +import hivemall.utils.lang.StringUtils; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.io.Text; + +import javax.annotation.Nonnegative; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import java.util.ArrayList; +import java.util.List; + +@Description(name = "word_ngrams", value = "_FUNC_(array<string> words, int minSize, int maxSize])" + + " - Returns list of n-grams for given words, where `minSize <= n <= maxSize`") +@UDFType(deterministic = true, stateful = false) +public final class WordNgramsUDF extends UDF { + + @Nullable + public List<Text> evaluate(@Nullable final List<Text> words, final int minSize, + final int maxSize) throws HiveException { + if (words == null) { + return null; + } + if (minSize <= 0) { + throw new UDFArgumentException("`minSize` must be greater than zero: " + minSize); + } + if (minSize > maxSize) { + throw new UDFArgumentException("`maxSize` must be greater than or equal to `minSize`: " + + maxSize); + } + return getNgrams(words, minSize, maxSize); + } + + @Nonnull + private static List<Text> getNgrams(@Nonnull final List<Text> words, + @Nonnegative final int minSize, @Nonnegative final int maxSize) throws HiveException { + final List<Text> ngrams = new ArrayList<Text>(); + final StringBuilder ngram = new StringBuilder(); + + for (int i = 0, numWords = words.size(); i < numWords; i++) { + for (int ngramSize = minSize; ngramSize <= maxSize; ngramSize++) { + final int end = i + ngramSize; + if (end > numWords) { // exceeds the final element + continue; + } + + StringUtils.clear(ngram); + for (int j = i; j < end; j++) { + final Text word = words.get(j); + if (word == null) { + throw new UDFArgumentException( + "`array<string> words` must 
not contain NULL element"); + } + if (j > i) { // insert single whitespace between elements + ngram.append(" "); + } + ngram.append(word.toString()); + } + ngrams.add(new Text(ngram.toString())); + } + } + + return ngrams; + } + +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/core/src/test/java/hivemall/tools/text/WordNgramsUDFTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/tools/text/WordNgramsUDFTest.java b/core/src/test/java/hivemall/tools/text/WordNgramsUDFTest.java new file mode 100644 index 0000000..9b15e68 --- /dev/null +++ b/core/src/test/java/hivemall/tools/text/WordNgramsUDFTest.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.tools.text; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.Text; + +import java.util.ArrayList; +import java.util.List; + +public class WordNgramsUDFTest { + + private WordNgramsUDF udf; + + @Before + public void setUp() { + this.udf = new WordNgramsUDF(); + } + + @Test + public void testBigram() throws HiveException { + final List<Text> words = new ArrayList<Text>(); + words.add(new Text("machine")); + words.add(new Text("learning")); + + final List<Text> ngrams = udf.evaluate(words, 2, 2); + + Assert.assertTrue(ngrams.size() == 1); + Assert.assertTrue(ngrams.contains(new Text("machine learning"))); + } + + @Test + public void testUniBigram() throws HiveException { + final List<Text> words = new ArrayList<Text>(); + words.add(new Text("machine")); + words.add(new Text("learning")); + + final List<Text> ngrams = udf.evaluate(words, 1, 2); + + Assert.assertTrue(ngrams.size() == 3); + Assert.assertTrue(ngrams.contains(new Text("machine"))); + Assert.assertTrue(ngrams.contains(new Text("learning"))); + Assert.assertTrue(ngrams.contains(new Text("machine learning"))); + } + + @Test(expected = UDFArgumentException.class) + public void testWordsWithNull() throws HiveException { + final List<Text> words = new ArrayList<Text>(); + words.add(new Text("machine")); + words.add(null); + words.add(new Text("learning")); + + udf.evaluate(words, 1, 2); + } + + @Test(expected = UDFArgumentException.class) + public void testInvalidMinSize() throws HiveException { + udf.evaluate(new ArrayList<Text>(), 0, 2); + } + + @Test(expected = UDFArgumentException.class) + public void testInvalidMaxSize() throws HiveException { + udf.evaluate(new ArrayList<Text>(), 2, 1); + } + +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/docs/gitbook/misc/generic_funcs.md 
---------------------------------------------------------------------- diff --git a/docs/gitbook/misc/generic_funcs.md b/docs/gitbook/misc/generic_funcs.md index 9775439..b6c7c62 100644 --- a/docs/gitbook/misc/generic_funcs.md +++ b/docs/gitbook/misc/generic_funcs.md @@ -257,6 +257,14 @@ The compression level must be in range [-1,9] > ["kuromoji","使う","分かち書き","テスト","第","二","引数","normal","search","extended","指定","デフォルト","normal"," モード"] ``` +- `word_ngrams(array<string> words, int minSize, int maxSize)` - Returns list of n-grams where `minSize <= n <= maxSize` + + ```sql + select word_ngrams(tokenize('Machine learning is fun!', true), 1, 2); + + > ["machine","machine learning","learning","learning is","is","is fun","fun"] + ``` + # Other functions - `convert_label(const int|const float)` - Convert from -1|1 to 0.0f|1.0f, or from 0.0f|1.0f to -1|1 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/resources/ddl/define-all-as-permanent.hive ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index d2f0b9f..7906375 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -553,6 +553,9 @@ CREATE FUNCTION base91 as 'hivemall.tools.text.Base91UDF' USING JAR '${hivemall_ DROP FUNCTION IF EXISTS unbase91; CREATE FUNCTION unbase91 as 'hivemall.tools.text.Unbase91UDF' USING JAR '${hivemall_jar}'; +DROP FUNCTION IF EXISTS word_ngrams; +CREATE FUNCTION word_ngrams as 'hivemall.tools.text.WordNgramsUDF' USING JAR '${hivemall_jar}'; + --------------------------------- -- Dataset generator functions -- --------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/resources/ddl/define-all.hive ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive 
index 0ef36c3..1b1a035 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -545,6 +545,9 @@ create temporary function base91 as 'hivemall.tools.text.Base91UDF'; drop temporary function if exists unbase91; create temporary function unbase91 as 'hivemall.tools.text.Unbase91UDF'; +drop temporary function if exists word_ngrams; +create temporary function word_ngrams as 'hivemall.tools.text.WordNgramsUDF'; + --------------------------------- -- Dataset generator functions -- --------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/resources/ddl/define-all.spark ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index 97307c2..7e6cacd 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -529,6 +529,9 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION base91 AS 'hivemall.tools.text.Base91U sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS unbase91") sqlContext.sql("CREATE TEMPORARY FUNCTION unbase91 AS 'hivemall.tools.text.Unbase91UDF'") +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS word_ngrams") +sqlContext.sql("CREATE TEMPORARY FUNCTION word_ngrams AS 'hivemall.tools.text.WordNgramsUDF'") + /** * Dataset generator functions */ http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/resources/ddl/define-udfs.td.hql ---------------------------------------------------------------------- diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index a281b72..4b67fea 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -182,6 +182,7 @@ create temporary function to_ordered_list as 'hivemall.tools.list.UDAFToOrderedL create temporary function singularize as 'hivemall.tools.text.SingularizeUDF'; create temporary function train_slim as 'hivemall.recommend.SlimUDTF'; create temporary function hitrate as 
'hivemall.evaluation.HitRateUDAF'; +create temporary function word_ngrams as 'hivemall.tools.text.WordNgramsUDF'; -- NLP features create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';