Repository: incubator-hivemall
Updated Branches:
  refs/heads/master 1e4238757 -> 7bb5d047d


[HIVEMALL-146] Yet another UDF to generate n-grams

## What changes were proposed in this pull request?

Add a new UDF `word_ngrams(array<string> words, int minSize, int maxSize)` which 
returns a list of n-grams with `minSize <= n <= maxSize` for the given words. This 
UDF can be used as an alternative to the original Hive `ngrams` function.

## What type of PR is it?

Feature

## What is the Jira issue?

https://issues.apache.org/jira/browse/HIVEMALL-146

## How was this patch tested?

Unit tests, plus manual tests on both EMR and local Hive

## How to use this feature?

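As documented in `docs/gitbook/misc/generic_funcs.md`, e.g.
`select word_ngrams(tokenize('Machine learning is fun!', true), 1, 2);`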

## Checklist

(Please remove this section if not needed; check `x` for YES, blank for NO)

- [x] Did you apply source code formatter, i.e., `mvn formatter:format`, for 
your commit?
- [x] Did you run system tests on Hive (or Spark)?

Author: Takuya Kitazawa <k.tak...@gmail.com>

Closes #118 from takuti/HIVEMALL-146-ngrams.


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/7bb5d047
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/7bb5d047
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/7bb5d047

Branch: refs/heads/master
Commit: 7bb5d047dcce7e97336d4b73bb3bd078f2a6fc8a
Parents: 1e42387
Author: Takuya Kitazawa <k.tak...@gmail.com>
Authored: Wed Oct 4 12:06:26 2017 +0900
Committer: Takuya Kitazawa <tak...@apache.org>
Committed: Wed Oct 4 12:06:26 2017 +0900

----------------------------------------------------------------------
 .../java/hivemall/tools/text/WordNgramsUDF.java | 90 ++++++++++++++++++++
 .../hivemall/tools/text/WordNgramsUDFTest.java  | 87 +++++++++++++++++++
 docs/gitbook/misc/generic_funcs.md              |  8 ++
 resources/ddl/define-all-as-permanent.hive      |  3 +
 resources/ddl/define-all.hive                   |  3 +
 resources/ddl/define-all.spark                  |  3 +
 resources/ddl/define-udfs.td.hql                |  1 +
 7 files changed, 195 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java 
b/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java
new file mode 100644
index 0000000..e4e5504
--- /dev/null
+++ b/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.tools.text;
+
+import hivemall.utils.lang.StringUtils;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.io.Text;
+
+import javax.annotation.Nonnegative;
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+
+import java.util.ArrayList;
+import java.util.List;
+
+@Description(name = "word_ngrams", value = "_FUNC_(array<string> words, int 
minSize, int maxSize])"
+        + " - Returns list of n-grams for given words, where `minSize <= n <= 
maxSize`")
+@UDFType(deterministic = true, stateful = false)
+public final class WordNgramsUDF extends UDF {
+
+    @Nullable
+    public List<Text> evaluate(@Nullable final List<Text> words, final int 
minSize,
+            final int maxSize) throws HiveException {
+        if (words == null) {
+            return null;
+        }
+        if (minSize <= 0) {
+            throw new UDFArgumentException("`minSize` must be greater than 
zero: " + minSize);
+        }
+        if (minSize > maxSize) {
+            throw new UDFArgumentException("`maxSize` must be greater than or 
equal to `minSize`: "
+                    + maxSize);
+        }
+        return getNgrams(words, minSize, maxSize);
+    }
+
+    @Nonnull
+    private static List<Text> getNgrams(@Nonnull final List<Text> words,
+            @Nonnegative final int minSize, @Nonnegative final int maxSize) 
throws HiveException {
+        final List<Text> ngrams = new ArrayList<Text>();
+        final StringBuilder ngram = new StringBuilder();
+
+        for (int i = 0, numWords = words.size(); i < numWords; i++) {
+            for (int ngramSize = minSize; ngramSize <= maxSize; ngramSize++) {
+                final int end = i + ngramSize;
+                if (end > numWords) { // exceeds the final element
+                    continue;
+                }
+
+                StringUtils.clear(ngram);
+                for (int j = i; j < end; j++) {
+                    final Text word = words.get(j);
+                    if (word == null) {
+                        throw new UDFArgumentException(
+                            "`array<string> words` must not contain NULL 
element");
+                    }
+                    if (j > i) { // insert single whitespace between elements
+                        ngram.append(" ");
+                    }
+                    ngram.append(word.toString());
+                }
+                ngrams.add(new Text(ngram.toString()));
+            }
+        }
+
+        return ngrams;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/core/src/test/java/hivemall/tools/text/WordNgramsUDFTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/hivemall/tools/text/WordNgramsUDFTest.java 
b/core/src/test/java/hivemall/tools/text/WordNgramsUDFTest.java
new file mode 100644
index 0000000..9b15e68
--- /dev/null
+++ b/core/src/test/java/hivemall/tools/text/WordNgramsUDFTest.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.tools.text;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.io.Text;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class WordNgramsUDFTest {
+
+    private WordNgramsUDF udf;
+
+    @Before
+    public void setUp() {
+        this.udf = new WordNgramsUDF();
+    }
+
+    @Test
+    public void testBigram() throws HiveException {
+        final List<Text> words = new ArrayList<Text>();
+        words.add(new Text("machine"));
+        words.add(new Text("learning"));
+
+        final List<Text> ngrams = udf.evaluate(words, 2, 2);
+
+        Assert.assertTrue(ngrams.size() == 1);
+        Assert.assertTrue(ngrams.contains(new Text("machine learning")));
+    }
+
+    @Test
+    public void testUniBigram() throws HiveException {
+        final List<Text> words = new ArrayList<Text>();
+        words.add(new Text("machine"));
+        words.add(new Text("learning"));
+
+        final List<Text> ngrams = udf.evaluate(words, 1, 2);
+
+        Assert.assertTrue(ngrams.size() == 3);
+        Assert.assertTrue(ngrams.contains(new Text("machine")));
+        Assert.assertTrue(ngrams.contains(new Text("learning")));
+        Assert.assertTrue(ngrams.contains(new Text("machine learning")));
+    }
+
+    @Test(expected = UDFArgumentException.class)
+    public void testWordsWithNull() throws HiveException {
+        final List<Text> words = new ArrayList<Text>();
+        words.add(new Text("machine"));
+        words.add(null);
+        words.add(new Text("learning"));
+
+        udf.evaluate(words, 1, 2);
+    }
+
+    @Test(expected = UDFArgumentException.class)
+    public void testInvalidMinSize() throws HiveException {
+        udf.evaluate(new ArrayList<Text>(), 0, 2);
+    }
+
+    @Test(expected = UDFArgumentException.class)
+    public void testInvalidMaxSize() throws HiveException {
+        udf.evaluate(new ArrayList<Text>(), 2, 1);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/docs/gitbook/misc/generic_funcs.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/misc/generic_funcs.md 
b/docs/gitbook/misc/generic_funcs.md
index 9775439..b6c7c62 100644
--- a/docs/gitbook/misc/generic_funcs.md
+++ b/docs/gitbook/misc/generic_funcs.md
@@ -257,6 +257,14 @@ The compression level must be in range [-1,9]
     > 
["kuromoji","使う","分かち書き","テスト","第","二","引数","normal","search","extended","指定","デフォルト","normal","
 モード"]
     ```
 
+- `word_ngrams(array<string> words, int minSize, int maxSize)` - Returns list 
of n-grams where `minSize <= n <= maxSize`
+
+    ```sql
+    select word_ngrams(tokenize('Machine learning is fun!', true), 1, 2);
+
+    > ["machine","machine learning","learning","learning is","is","is 
fun","fun"]
+    ```
+
 # Other functions
 
 - `convert_label(const int|const float)` - Convert from -1|1 to 0.0f|1.0f, or 
from 0.0f|1.0f to -1|1

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/resources/ddl/define-all-as-permanent.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all-as-permanent.hive 
b/resources/ddl/define-all-as-permanent.hive
index d2f0b9f..7906375 100644
--- a/resources/ddl/define-all-as-permanent.hive
+++ b/resources/ddl/define-all-as-permanent.hive
@@ -553,6 +553,9 @@ CREATE FUNCTION base91 as 'hivemall.tools.text.Base91UDF' 
USING JAR '${hivemall_
 DROP FUNCTION IF EXISTS unbase91;
 CREATE FUNCTION unbase91 as 'hivemall.tools.text.Unbase91UDF' USING JAR 
'${hivemall_jar}';
 
+DROP FUNCTION IF EXISTS word_ngrams;
+CREATE FUNCTION word_ngrams as 'hivemall.tools.text.WordNgramsUDF' USING JAR 
'${hivemall_jar}';
+
 ---------------------------------
 -- Dataset generator functions --
 ---------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/resources/ddl/define-all.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive
index 0ef36c3..1b1a035 100644
--- a/resources/ddl/define-all.hive
+++ b/resources/ddl/define-all.hive
@@ -545,6 +545,9 @@ create temporary function base91 as 
'hivemall.tools.text.Base91UDF';
 drop temporary function if exists unbase91;
 create temporary function unbase91 as 'hivemall.tools.text.Unbase91UDF';
 
+drop temporary function if exists word_ngrams;
+create temporary function word_ngrams as 'hivemall.tools.text.WordNgramsUDF';
+
 ---------------------------------
 -- Dataset generator functions --
 ---------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/resources/ddl/define-all.spark
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark
index 97307c2..7e6cacd 100644
--- a/resources/ddl/define-all.spark
+++ b/resources/ddl/define-all.spark
@@ -529,6 +529,9 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION base91 AS 
'hivemall.tools.text.Base91U
 sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS unbase91")
 sqlContext.sql("CREATE TEMPORARY FUNCTION unbase91 AS 
'hivemall.tools.text.Unbase91UDF'")
 
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS word_ngrams")
+sqlContext.sql("CREATE TEMPORARY FUNCTION word_ngrams AS 
'hivemall.tools.text.WordNgramsUDF'")
+
 /**
  * Dataset generator functions
  */

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/resources/ddl/define-udfs.td.hql
----------------------------------------------------------------------
diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql
index a281b72..4b67fea 100644
--- a/resources/ddl/define-udfs.td.hql
+++ b/resources/ddl/define-udfs.td.hql
@@ -182,6 +182,7 @@ create temporary function to_ordered_list as 
'hivemall.tools.list.UDAFToOrderedL
 create temporary function singularize as 'hivemall.tools.text.SingularizeUDF';
 create temporary function train_slim as 'hivemall.recommend.SlimUDTF';
 create temporary function hitrate as 'hivemall.evaluation.HitRateUDAF';
+create temporary function word_ngrams as 'hivemall.tools.text.WordNgramsUDF';
 
 -- NLP features
 create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';

Reply via email to