Fixed some typos in the NLP module
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/efc3a6de Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/efc3a6de Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/efc3a6de Branch: refs/heads/master Commit: efc3a6deecdc65eebf6946c6b1efb253debdca1b Parents: e24c4fc Author: partyyoung <[email protected]> Authored: Sat Jul 1 19:13:13 2017 +0800 Committer: partyyoung <[email protected]> Committed: Sat Jul 1 19:13:13 2017 +0800 ---------------------------------------------------------------------- docs/gitbook/misc/tokenizer.md | 16 +- .../java/hivemall/nlp/tokenizer/SmartcnUDF.java | 167 ++++++++++--------- .../hivemall/nlp/tokenizer/KuromojiUDFTest.java | 12 +- .../hivemall/nlp/tokenizer/SmartcnUDFTest.java | 6 +- resources/ddl/define-udfs.td.hql | 1 + 5 files changed, 102 insertions(+), 100 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/docs/gitbook/misc/tokenizer.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md index a2d3820..99f281d 100644 --- a/docs/gitbook/misc/tokenizer.md +++ b/docs/gitbook/misc/tokenizer.md @@ -24,9 +24,9 @@ Hivemall provides simple English text tokenizer UDF that has following syntax: tokenize(text input, optional boolean toLowerCase = false) ``` -# Tokenizer for Japanese Texts +# Tokenizer for Non-English Texts -Hivemall-NLP module provides a Japanese text tokenizer UDF using [Kuromoji](https://github.com/atilika/kuromoji). +Hivemall-NLP module provides some Non-English Text tokenizer UDFs as follows. First of all, you need to issue the following DDLs to use the NLP module. Note NLP module is not included in [hivemall-with-dependencies.jar](https://github.com/myui/hivemall/releases). 
@@ -34,6 +34,10 @@ First of all, you need to issue the following DDLs to use the NLP module. Note N > source > /tmp/[define-additional.hive](https://github.com/myui/hivemall/releases); +## Japanese Tokenizer + +Japanese text tokenizer UDF uses [Kuromoji](https://github.com/atilika/kuromoji). + The signature of the UDF is as follows: ```sql tokenize_ja(text input, optional const text mode = "normal", optional const array<string> stopWords, optional const array<string> stopTags) @@ -48,13 +52,9 @@ select tokenize_ja("kuromojiを使った分かち書きのテストです。第 For detailed APIs, please refer Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well. -# Tokenizer for Chinese Texts - -Hivemall-NLP module provides a Chinese text tokenizer UDF using [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html). +## Chinese Tokenizer -> add jar /tmp/[hivemall-nlp-xxx-with-dependencies.jar](https://github.com/myui/hivemall/releases); - -> source /tmp/[define-additional.hive](https://github.com/myui/hivemall/releases); +Chinese text tokenizer UDF uses [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html). 
The signature of the UDF is as follows: ```sql http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java ---------------------------------------------------------------------- diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java index 3d148c9..a016c7e 100644 --- a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java +++ b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java @@ -42,96 +42,97 @@ import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.util.CharArraySet; -@Description( - name = "tokenize_cn", - value = "_FUNC_(String line [, const list<string> stopWords])" - + " - returns tokenized strings in array<string>") +@Description(name = "tokenize_cn", value = "_FUNC_(String line [, const list<string> stopWords])" + + " - returns tokenized strings in array<string>") @UDFType(deterministic = true, stateful = false) public final class SmartcnUDF extends GenericUDF { - private String[] _stopWordsArray; + private String[] _stopWordsArray; - private transient SmartChineseAnalyzer _analyzer; + private transient SmartChineseAnalyzer _analyzer; - @Override - public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { - final int arglen = arguments.length; - if (arglen < 1 || arglen > 2) { - throw new UDFArgumentException("Invalid number of arguments for `tokenize_cn`: " - + arglen); - } + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) + throws UDFArgumentException { + final int arglen = arguments.length; + if (arglen < 1 || arglen > 2) { + throw new UDFArgumentException( + "Invalid number of arguments for `tokenize_cn`: " + arglen); + } - this._stopWordsArray = (arglen >= 2) ? 
HiveUtils.getConstStringArray(arguments[1]) : null; - this._analyzer = null; + this._stopWordsArray = (arglen >= 2) ? HiveUtils + .getConstStringArray(arguments[1]) : null; + this._analyzer = null; - return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector); - } + return ObjectInspectorFactory + .getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector); + } - @Override - public List<Text> evaluate(DeferredObject[] arguments) throws HiveException { - SmartChineseAnalyzer analyzer = _analyzer; - if (analyzer == null) { + @Override + public List<Text> evaluate(DeferredObject[] arguments) throws HiveException { + SmartChineseAnalyzer analyzer = _analyzer; + if (analyzer == null) { CharArraySet stopwords = stopWords(_stopWordsArray); - analyzer= new SmartChineseAnalyzer(stopwords); - this._analyzer = analyzer; - } - - Object arg0 = arguments[0].get(); - if (arg0 == null) { - return null; - } - String line = arg0.toString(); - - final List<Text> results = new ArrayList<Text>(32); - TokenStream stream = null; - try { - stream = analyzer.tokenStream("", line); - if (stream != null) { - analyzeTokens(stream, results); - } - } catch (IOException e) { - IOUtils.closeQuietly(analyzer); - throw new HiveException(e); - } finally { - IOUtils.closeQuietly(stream); - } - return results; - } - - @Override - public void close() throws IOException { - IOUtils.closeQuietly(_analyzer); - } - - - @Nonnull - private static CharArraySet stopWords(@Nonnull final String[] array) - throws UDFArgumentException { - if (array == null) { - return SmartChineseAnalyzer.getDefaultStopSet(); - } - if (array.length == 0) { - return CharArraySet.EMPTY_SET; - } - CharArraySet results = new CharArraySet(Arrays.asList(array), /* ignoreCase */true); - return results; - } - - private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results) - throws IOException { - // instantiate 
an attribute placeholder once - CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class); - stream.reset(); - - while (stream.incrementToken()) { - String term = termAttr.toString(); - results.add(new Text(term)); - } - } - - @Override - public String getDisplayString(String[] children) { - return "tokenize_cn(" + Arrays.toString(children) + ')'; - } - + analyzer = new SmartChineseAnalyzer(stopwords); + this._analyzer = analyzer; + } + + Object arg0 = arguments[0].get(); + if (arg0 == null) { + return null; + } + String line = arg0.toString(); + + final List<Text> results = new ArrayList<Text>(32); + TokenStream stream = null; + try { + stream = analyzer.tokenStream("", line); + if (stream != null) { + analyzeTokens(stream, results); + } + } catch (IOException e) { + IOUtils.closeQuietly(analyzer); + throw new HiveException(e); + } finally { + IOUtils.closeQuietly(stream); + } + return results; + } + + @Override + public void close() throws IOException { + IOUtils.closeQuietly(_analyzer); + } + + @Nonnull + private static CharArraySet stopWords(@Nonnull final String[] array) + throws UDFArgumentException { + if (array == null) { + return SmartChineseAnalyzer.getDefaultStopSet(); + } + if (array.length == 0) { + return CharArraySet.EMPTY_SET; + } + CharArraySet results = new CharArraySet(Arrays.asList(array), /* ignoreCase */ + true); + return results; + } + + private static void analyzeTokens(@Nonnull TokenStream stream, + @Nonnull List<Text> results) throws IOException { + // instantiate an attribute placeholder once + CharTermAttribute termAttr = stream + .getAttribute(CharTermAttribute.class); + stream.reset(); + + while (stream.incrementToken()) { + String term = termAttr.toString(); + results.add(new Text(term)); + } + } + + @Override + public String getDisplayString(String[] children) { + return "tokenize_cn(" + Arrays.toString(children) + ')'; + } } 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java ---------------------------------------------------------------------- diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java index 005e689..7bbaed7 100644 --- a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java +++ b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java @@ -40,7 +40,7 @@ import com.esotericsoftware.kryo.io.Output; public class KuromojiUDFTest { @Test - public void testOneArgment() throws UDFArgumentException, IOException { + public void testOneArgument() throws UDFArgumentException, IOException { GenericUDF udf = new KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[1]; // line @@ -50,7 +50,7 @@ public class KuromojiUDFTest { } @Test - public void testTwoArgment() throws UDFArgumentException, IOException { + public void testTwoArgument() throws UDFArgumentException, IOException { GenericUDF udf = new KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[2]; // line @@ -94,7 +94,7 @@ public class KuromojiUDFTest { } @Test - public void testThreeArgment() throws UDFArgumentException, IOException { + public void testThreeArgument() throws UDFArgumentException, IOException { GenericUDF udf = new KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[3]; // line @@ -112,7 +112,7 @@ public class KuromojiUDFTest { } @Test - public void testFourArgment() throws UDFArgumentException, IOException { + public void testFourArgument() throws UDFArgumentException, IOException { GenericUDF udf = new KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[4]; // line @@ -133,7 +133,7 @@ public class KuromojiUDFTest { } @Test - public void testEvalauteOneRow() throws IOException, HiveException { + public void testEvaluateOneRow() throws IOException, HiveException { KuromojiUDF udf = new 
KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[1]; // line @@ -156,7 +156,7 @@ public class KuromojiUDFTest { } @Test - public void testEvalauteTwoRows() throws IOException, HiveException { + public void testEvaluateTwoRows() throws IOException, HiveException { KuromojiUDF udf = new KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[1]; // line http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java ---------------------------------------------------------------------- diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java index 720e532..67c2283 100644 --- a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java +++ b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java @@ -35,7 +35,7 @@ import org.junit.Test; public class SmartcnUDFTest { @Test - public void testOneArgment() throws UDFArgumentException, IOException { + public void testOneArgument() throws UDFArgumentException, IOException { GenericUDF udf = new SmartcnUDF(); ObjectInspector[] argOIs = new ObjectInspector[1]; // line @@ -45,7 +45,7 @@ public class SmartcnUDFTest { } @Test - public void testTwoArgment() throws UDFArgumentException, IOException { + public void testTwoArgument() throws UDFArgumentException, IOException { GenericUDF udf = new SmartcnUDF(); ObjectInspector[] argOIs = new ObjectInspector[2]; // line @@ -60,7 +60,7 @@ public class SmartcnUDFTest { } @Test - public void testEvalauteOneRow() throws IOException, HiveException { + public void testEvaluateOneRow() throws IOException, HiveException { SmartcnUDF udf = new SmartcnUDF(); ObjectInspector[] argOIs = new ObjectInspector[1]; // line http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/resources/ddl/define-udfs.td.hql ---------------------------------------------------------------------- diff --git 
a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index 1d11d1a..953a6ac 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -176,6 +176,7 @@ create temporary function train_regression as 'hivemall.regression.GeneralRegres -- NLP features create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF'; +create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF'; -- Backward compatibilities create temporary function concat_array as 'hivemall.tools.array.ArrayConcatUDF';
