Fixed some typos in the NLP module
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/efc3a6de Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/efc3a6de Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/efc3a6de Branch: refs/heads/master Commit: efc3a6deecdc65eebf6946c6b1efb253debdca1b Parents: e24c4fc Author: partyyoung <[email protected]> Authored: Sat Jul 1 19:13:13 2017 +0800 Committer: partyyoung <[email protected]> Committed: Sat Jul 1 19:13:13 2017 +0800 ---------------------------------------------------------------------- docs/gitbook/misc/tokenizer.md | 16 +- .../java/hivemall/nlp/tokenizer/SmartcnUDF.java | 167 ++++++++++--------- .../hivemall/nlp/tokenizer/KuromojiUDFTest.java | 12 +- .../hivemall/nlp/tokenizer/SmartcnUDFTest.java | 6 +- resources/ddl/define-udfs.td.hql | 1 + 5 files changed, 102 insertions(+), 100 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/docs/gitbook/misc/tokenizer.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md index a2d3820..99f281d 100644 --- a/docs/gitbook/misc/tokenizer.md +++ b/docs/gitbook/misc/tokenizer.md @@ -24,9 +24,9 @@ Hivemall provides simple English text tokenizer UDF that has following syntax: tokenize(text input, optional boolean toLowerCase = false) ``` -# Tokenizer for Japanese Texts +# Tokenizer for Non-English Texts -Hivemall-NLP module provides a Japanese text tokenizer UDF using [Kuromoji](https://github.com/atilika/kuromoji). +Hivemall-NLP module provides some Non-English Text tokenizer UDFs as follows. First of all, you need to issue the following DDLs to use the NLP module. Note NLP module is not included in [hivemall-with-dependencies.jar](https://github.com/myui/hivemall/releases). 
@@ -34,6 +34,10 @@ First of all, you need to issue the following DDLs to use the NLP module. Note N > source > /tmp/[define-additional.hive](https://github.com/myui/hivemall/releases); +## Japanese Tokenizer + +Japanese text tokenizer UDF uses [Kuromoji](https://github.com/atilika/kuromoji). + The signature of the UDF is as follows: ```sql tokenize_ja(text input, optional const text mode = "normal", optional const array<string> stopWords, optional const array<string> stopTags) @@ -48,13 +52,9 @@ select tokenize_ja("kuromojiを使った分かち書きのテストです。第 For detailed APIs, please refer Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well. -# Tokenizer for Chinese Texts - -Hivemall-NLP module provides a Chinese text tokenizer UDF using [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html). +## Chinese Tokenizer -> add jar /tmp/[hivemall-nlp-xxx-with-dependencies.jar](https://github.com/myui/hivemall/releases); - -> source /tmp/[define-additional.hive](https://github.com/myui/hivemall/releases); +Chinese text tokenizer UDF uses [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html). 
The signature of the UDF is as follows: ```sql http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java ---------------------------------------------------------------------- diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java index 3d148c9..a016c7e 100644 --- a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java +++ b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java @@ -42,96 +42,97 @@ import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.util.CharArraySet; -@Description( - name = "tokenize_cn", - value = "_FUNC_(String line [, const list<string> stopWords])" - + " - returns tokenized strings in array<string>") +@Description(name = "tokenize_cn", value = "_FUNC_(String line [, const list<string> stopWords])" + + " - returns tokenized strings in array<string>") @UDFType(deterministic = true, stateful = false) public final class SmartcnUDF extends GenericUDF { - private String[] _stopWordsArray; + private String[] _stopWordsArray; - private transient SmartChineseAnalyzer _analyzer; + private transient SmartChineseAnalyzer _analyzer; - @Override - public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { - final int arglen = arguments.length; - if (arglen < 1 || arglen > 2) { - throw new UDFArgumentException("Invalid number of arguments for `tokenize_cn`: " - + arglen); - } + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) + throws UDFArgumentException { + final int arglen = arguments.length; + if (arglen < 1 || arglen > 2) { + throw new UDFArgumentException( + "Invalid number of arguments for `tokenize_cn`: " + arglen); + } - this._stopWordsArray = (arglen >= 2) ? 
HiveUtils.getConstStringArray(arguments[1]) : null; - this._analyzer = null; + this._stopWordsArray = (arglen >= 2) ? HiveUtils + .getConstStringArray(arguments[1]) : null; + this._analyzer = null; - return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector); - } + return ObjectInspectorFactory + .getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector); + } - @Override - public List<Text> evaluate(DeferredObject[] arguments) throws HiveException { - SmartChineseAnalyzer analyzer = _analyzer; - if (analyzer == null) { + @Override + public List<Text> evaluate(DeferredObject[] arguments) throws HiveException { + SmartChineseAnalyzer analyzer = _analyzer; + if (analyzer == null) { CharArraySet stopwords = stopWords(_stopWordsArray); - analyzer= new SmartChineseAnalyzer(stopwords); - this._analyzer = analyzer; - } - - Object arg0 = arguments[0].get(); - if (arg0 == null) { - return null; - } - String line = arg0.toString(); - - final List<Text> results = new ArrayList<Text>(32); - TokenStream stream = null; - try { - stream = analyzer.tokenStream("", line); - if (stream != null) { - analyzeTokens(stream, results); - } - } catch (IOException e) { - IOUtils.closeQuietly(analyzer); - throw new HiveException(e); - } finally { - IOUtils.closeQuietly(stream); - } - return results; - } - - @Override - public void close() throws IOException { - IOUtils.closeQuietly(_analyzer); - } - - - @Nonnull - private static CharArraySet stopWords(@Nonnull final String[] array) - throws UDFArgumentException { - if (array == null) { - return SmartChineseAnalyzer.getDefaultStopSet(); - } - if (array.length == 0) { - return CharArraySet.EMPTY_SET; - } - CharArraySet results = new CharArraySet(Arrays.asList(array), /* ignoreCase */true); - return results; - } - - private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results) - throws IOException { - // instantiate 
an attribute placeholder once - CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class); - stream.reset(); - - while (stream.incrementToken()) { - String term = termAttr.toString(); - results.add(new Text(term)); - } - } - - @Override - public String getDisplayString(String[] children) { - return "tokenize_cn(" + Arrays.toString(children) + ')'; - } - + analyzer = new SmartChineseAnalyzer(stopwords); + this._analyzer = analyzer; + } + + Object arg0 = arguments[0].get(); + if (arg0 == null) { + return null; + } + String line = arg0.toString(); + + final List<Text> results = new ArrayList<Text>(32); + TokenStream stream = null; + try { + stream = analyzer.tokenStream("", line); + if (stream != null) { + analyzeTokens(stream, results); + } + } catch (IOException e) { + IOUtils.closeQuietly(analyzer); + throw new HiveException(e); + } finally { + IOUtils.closeQuietly(stream); + } + return results; + } + + @Override + public void close() throws IOException { + IOUtils.closeQuietly(_analyzer); + } + + @Nonnull + private static CharArraySet stopWords(@Nonnull final String[] array) + throws UDFArgumentException { + if (array == null) { + return SmartChineseAnalyzer.getDefaultStopSet(); + } + if (array.length == 0) { + return CharArraySet.EMPTY_SET; + } + CharArraySet results = new CharArraySet(Arrays.asList(array), /* ignoreCase */ + true); + return results; + } + + private static void analyzeTokens(@Nonnull TokenStream stream, + @Nonnull List<Text> results) throws IOException { + // instantiate an attribute placeholder once + CharTermAttribute termAttr = stream + .getAttribute(CharTermAttribute.class); + stream.reset(); + + while (stream.incrementToken()) { + String term = termAttr.toString(); + results.add(new Text(term)); + } + } + + @Override + public String getDisplayString(String[] children) { + return "tokenize_cn(" + Arrays.toString(children) + ')'; + } } 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java ---------------------------------------------------------------------- diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java index 005e689..7bbaed7 100644 --- a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java +++ b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java @@ -40,7 +40,7 @@ import com.esotericsoftware.kryo.io.Output; public class KuromojiUDFTest { @Test - public void testOneArgment() throws UDFArgumentException, IOException { + public void testOneArgument() throws UDFArgumentException, IOException { GenericUDF udf = new KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[1]; // line @@ -50,7 +50,7 @@ public class KuromojiUDFTest { } @Test - public void testTwoArgment() throws UDFArgumentException, IOException { + public void testTwoArgument() throws UDFArgumentException, IOException { GenericUDF udf = new KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[2]; // line @@ -94,7 +94,7 @@ public class KuromojiUDFTest { } @Test - public void testThreeArgment() throws UDFArgumentException, IOException { + public void testThreeArgument() throws UDFArgumentException, IOException { GenericUDF udf = new KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[3]; // line @@ -112,7 +112,7 @@ public class KuromojiUDFTest { } @Test - public void testFourArgment() throws UDFArgumentException, IOException { + public void testFourArgument() throws UDFArgumentException, IOException { GenericUDF udf = new KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[4]; // line @@ -133,7 +133,7 @@ public class KuromojiUDFTest { } @Test - public void testEvalauteOneRow() throws IOException, HiveException { + public void testEvaluateOneRow() throws IOException, HiveException { KuromojiUDF udf = new 
KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[1]; // line @@ -156,7 +156,7 @@ public class KuromojiUDFTest { } @Test - public void testEvalauteTwoRows() throws IOException, HiveException { + public void testEvaluateTwoRows() throws IOException, HiveException { KuromojiUDF udf = new KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[1]; // line http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java ---------------------------------------------------------------------- diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java index 720e532..67c2283 100644 --- a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java +++ b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java @@ -35,7 +35,7 @@ import org.junit.Test; public class SmartcnUDFTest { @Test - public void testOneArgment() throws UDFArgumentException, IOException { + public void testOneArgument() throws UDFArgumentException, IOException { GenericUDF udf = new SmartcnUDF(); ObjectInspector[] argOIs = new ObjectInspector[1]; // line @@ -45,7 +45,7 @@ public class SmartcnUDFTest { } @Test - public void testTwoArgment() throws UDFArgumentException, IOException { + public void testTwoArgument() throws UDFArgumentException, IOException { GenericUDF udf = new SmartcnUDF(); ObjectInspector[] argOIs = new ObjectInspector[2]; // line @@ -60,7 +60,7 @@ public class SmartcnUDFTest { } @Test - public void testEvalauteOneRow() throws IOException, HiveException { + public void testEvaluateOneRow() throws IOException, HiveException { SmartcnUDF udf = new SmartcnUDF(); ObjectInspector[] argOIs = new ObjectInspector[1]; // line http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/resources/ddl/define-udfs.td.hql ---------------------------------------------------------------------- diff --git 
a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index 1d11d1a..953a6ac 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -176,6 +176,7 @@ create temporary function train_regression as 'hivemall.regression.GeneralRegres -- NLP features create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF'; +create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF'; -- Backward compatibilities create temporary function concat_array as 'hivemall.tools.array.ArrayConcatUDF';
