This is an automated email from the ASF dual-hosted git repository.
myui pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git
The following commit(s) were added to refs/heads/master by this push:
new b56c477 [HIVEMALL-305] Kuromoji Japanese tokenizer with Neologd
dictionary
b56c477 is described below
commit b56c477a20ef6d7be143cddc49d9f9f85e144b63
Author: Makoto Yui <[email protected]>
AuthorDate: Thu Apr 22 12:39:21 2021 +0900
[HIVEMALL-305] Kuromoji Japanese tokenizer with Neologd dictionary
## What changes were proposed in this pull request?
Add tokenize_ja_neologd UDF that uses Neologd dictionary for Kuromoji
tokenization.
## What type of PR is it?
Feature
## What is the Jira issue?
https://issues.apache.org/jira/browse/HIVEMALL-305
## How was this patch tested?
unit tests and manual tests on EMR
## How to use this feature?
```sql
tokenize_ja_neologd(text input, optional const text mode = "normal",
optional const array<string> stopWords, const array<string> stopTags, const
array<string> userDict)
select tokenize_ja_neologd("彼女はペンパイナッポーアッポーペンと恋ダンスを踊った。");
> ["彼女","ペンパイナッポーアッポーペン","恋ダンス","踊る"]
```
## Checklist
- [x] Did you apply source code formatter, i.e., `./bin/format_code.sh`,
for your commit?
- [x] Did you run system tests on Hive (or Spark)?
Author: Makoto Yui <[email protected]>
Closes #235 from myui/neologd.
---
.rat-excludes | 1 +
bin/update_ddls.sh | 12 +-
dist/pom.xml | 1 +
docs/gitbook/misc/funcs.md | 25 +++
docs/gitbook/misc/tokenizer.md | 49 +++--
nlp/pom.xml | 20 +-
.../{KuromojiUDF.java => KuromojiNEologdUDF.java} | 32 ++-
.../java/hivemall/nlp/tokenizer/KuromojiUDF.java | 14 +-
.../hivemall/nlp/tokenizer/tokenizer.properties | 2 +
...ojiUDFTest.java => KuromojiNEologdUDFTest.java} | 217 +++++----------------
.../hivemall/nlp/tokenizer/KuromojiUDFTest.java | 10 +
resources/ddl/define-additional.hive | 39 ----
resources/ddl/define-all-as-permanent.hive | 16 ++
resources/ddl/define-all.hive | 39 ++++
resources/ddl/define-all.spark | 38 ++++
.../java/hivemall/docs/FuncsListGeneratorMojo.java | 1 +
16 files changed, 273 insertions(+), 243 deletions(-)
diff --git a/.rat-excludes b/.rat-excludes
index fcb4b31..4593c7e 100644
--- a/.rat-excludes
+++ b/.rat-excludes
@@ -31,3 +31,4 @@ docs/gitbook/node_modules/**
**/derby.log
**/LICENSE-*.txt
**/Base91.java
+**/*.properties
diff --git a/bin/update_ddls.sh b/bin/update_ddls.sh
index 81c0516..fe54010 100755
--- a/bin/update_ddls.sh
+++ b/bin/update_ddls.sh
@@ -52,13 +52,5 @@ define_additional() {
read -p "Function name (e.g., 'hivemall_version'): " function_name
read -p "Class path (e.g., 'hivemall.HivemallVersionUDF'): " class_path
-prefix="$(echo "$class_path" | cut -d'.' -f1,2)"
-if [[ $prefix == 'hivemall.xgboost' ]]; then
- define_all_as_permanent
- define_additional
-elif [[ $prefix == 'hivemall.nlp' ]]; then
- define_additional
-else
- define_all
- define_all_as_permanent
-fi
+define_all
+define_all_as_permanent
diff --git a/dist/pom.xml b/dist/pom.xml
index 36de34a..80104ac 100644
--- a/dist/pom.xml
+++ b/dist/pom.xml
@@ -122,6 +122,7 @@
<include>org.apache.lucene:lucene-analyzers-smartcn</include>
<include>org.apache.lucene:lucene-analyzers-common</include>
<include>org.apache.lucene:lucene-core</include>
+
<include>io.github.myui:lucene-analyzers-kuromoji-neologd</include>
<!--
hivemall-xgboost -->
<include>org.apache.hivemall:hivemall-xgboost</include>
<include>io.github.myui:xgboost4j</include>
diff --git a/docs/gitbook/misc/funcs.md b/docs/gitbook/misc/funcs.md
index b3e1006..84f167c 100644
--- a/docs/gitbook/misc/funcs.md
+++ b/docs/gitbook/misc/funcs.md
@@ -1050,6 +1050,31 @@ Reference: <a
href="https://papers.nips.cc/paper/3848-adaptive-regularization-of
- `tfidf(double termFrequency, long numDocs, const long totalNumDocs)` -
Return a smoothed TFIDF score in double.
+# NLP
+
+- `stoptags_exclude(array<string> excludeTags [, const string lang='ja'])` -
Returns stoptags excluding given tags
+ ```sql
+ SELECT stoptags_exclude(array('名詞-固有名詞', '形容詞'))
+ ```
+
+- `tokenize_cn(String line [, const list<string> stopWords])` - returns
tokenized strings in array<string>
+
+- `tokenize_ja(String line [, const string mode = "normal", const
array<string> stopWords, const array<string> stopTags, const array<string>
userDict (or string userDictURL)`]) - returns tokenized strings in
array<string>
+ ```sql
+ select
tokenize_ja("kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。");
+
+ >
["kuromoji","使う","分かち書き","テスト","第","二","引数","normal","search","extended","指定","デフォルト","normal","
モード"]
+
+ ```
+
+- `tokenize_ja_neologd(String line [, const string mode = "normal", const
array<string> stopWords, const array<string> stopTags, const array<string>
userDict (or string userDictURL)`]) - returns tokenized strings in
array<string>
+ ```sql
+ select
tokenize_ja_neologd("kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。");
+
+ >
["kuromoji","使う","分かち書き","テスト","第","二","引数","normal","search","extended","指定","デフォルト","normal","
モード"]
+
+ ```
+
# Others
- `hivemall_version()` - Returns the version of Hivemall
diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md
index 0ecc9bf..78d9970 100644
--- a/docs/gitbook/misc/tokenizer.md
+++ b/docs/gitbook/misc/tokenizer.md
@@ -28,14 +28,6 @@ tokenize(text input, optional boolean toLowerCase = false)
# Tokenizer for Non-English Texts
-Hivemall-NLP module provides some Non-English Text tokenizer UDFs as follows.
-
-First of all, you need to issue the following DDLs to use the NLP module. Note
NLP module is not included in `hivemall-with-dependencies.jar`.
-
-> add jar /path/to/hivemall-nlp-xxx-with-dependencies.jar;
-
-> source /path/to/define-additional.hive;
-
## Japanese Tokenizer
Japanese text tokenizer UDF uses
[Kuromoji](https://github.com/atilika/kuromoji).
@@ -43,16 +35,42 @@ Japanese text tokenizer UDF uses
[Kuromoji](https://github.com/atilika/kuromoji)
The signature of the UDF is as follows:
```sql
+-- uses Kuromoji default dictionary by the default
tokenize_ja(text input, optional const text mode = "normal", optional const
array<string> stopWords, const array<string> stopTags, const array<string>
userDict)
+
+-- tokenize_ja_neologd uses mecab-ipadic-NEologd for its dictionary.
+tokenize_ja_neologd(text input, optional const text mode = "normal", optional
const array<string> stopWords, const array<string> stopTags, const
array<string> userDict)
```
> #### Note
-> `tokenize_ja` is supported since Hivemall v0.4.1, and the fifth argument is
supported since v0.5-rc.1 and later.
+> `tokenize_ja_neologd` returns tokenized strings in an array by using the
NEologd dictionary.
[mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) is a
customized system dictionary for MeCab including new vocabularies extracted from
many resources on the Web.
+
+The following example shows the difference between tokenization with and without NEologd:
+
+```sql
+select tokenize_ja("彼女はペンパイナッポーアッポーペンと恋ダンスを踊った。");
+>["彼女","ペンパイナッポーアッポーペン","恋","ダンス","踊る"]
+
+select tokenize_ja_neologd("彼女はペンパイナッポーアッポーペンと恋ダンスを踊った。");
+> ["彼女","ペンパイナッポーアッポーペン","恋ダンス","踊る"]
+```
+
+You can print versions for Kuromoji UDFs as follows:
+
+```sql
+select tokenize_ja();
+> ["8.8.2"]
+
+select tokenize_ja_neologd();
+> ["8.8.2-20200910.2"]
+```
Its basic usage is as follows:
+
```sql
select
tokenize_ja("kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。");
```
+
> ["kuromoji","使う","分かち書き","テスト","第","二","引数","normal","search","extended","指定","デフォルト","normal","モード"]
In addition, the third and fourth argument respectively allow you to use your
own list of stop words and stop tags. For example, the following query simply
ignores "kuromoji" (as a stop word) and noun word "分かち書き" (as a stop tag):
@@ -70,10 +88,10 @@ select tokenize_ja("kuromojiを使った分かち書きのテストです。", "
`stoptags_exclude(array<string> tags [, const string lang='ja'])` is a useful
UDF for getting
[stoptags](https://github.com/apache/lucene-solr/blob/master/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt)
excluding given part-of-speech tags as seen below:
-
```sql
select stoptags_exclude(array("名詞-固有名詞"));
```
+
> ["その他","その他-間投","フィラー","副詞","副詞-一般","副詞-助詞類接続","助動詞","助詞","助詞-並立助詞"
,"助詞-係助詞","助詞-副助詞","助詞-副助詞/並立助詞/終助詞","助詞-副詞化","助詞-接続助詞","助詞-格助詞
","助詞-格助詞-一般","助詞-格助詞-引用","助詞-格助詞-連語","助詞-特殊","助詞-終助詞","助詞-連体化","助
@@ -106,16 +124,19 @@ If you have a large custom dictionary as an external
file, `userDict` can also b
```sql
select tokenize_ja("日本経済新聞&関西国際空港", "normal", null, null,
"https://raw.githubusercontent.com/atilika/kuromoji/909fd6b32bf4e9dc86b7599de5c9b50ca8f004a1/kuromoji-core/src/test/resources/userdict.txt");
-```
> ["日本","経済","新聞","関西","国際","空港"]
+```
-Dictionary SHOULD be accessible through http/https protocol. And, it SHOULD be
compressed using gzip with `.gz` suffix because the maximum dictionary size is
limited to 32MB and read timeout is set to 60 sec. Also, connection must be
established in 10 sec.
-
-If you want to use HTTP Basic Authentication, please use the following form:
`https://user:[email protected]/my_dict.txt.gz` (see Sec 3.1 of
[rfc1738](https://www.ietf.org/rfc/rfc1738.txt))
+> #### Note
+> Dictionary SHOULD be accessible through http/https protocol. And, it SHOULD
be compressed using gzip with `.gz` suffix because the maximum dictionary size
is limited to 32MB and read timeout is set to 60 sec. Also, connection must be
established in 10 sec.
+>
+> If you want to use HTTP Basic Authentication, please use the following form:
`https://user:[email protected]/my_dict.txt.gz` (see Sec 3.1 of
[rfc1738](https://www.ietf.org/rfc/rfc1738.txt))
For detailed APIs, please refer Javadoc of
[JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html)
as well.
+
+
## Part-of-speech
From Hivemall v0.6.0, the second argument can also accept the following option
format:
diff --git a/nlp/pom.xml b/nlp/pom.xml
index 0324ca1..55afab7 100644
--- a/nlp/pom.xml
+++ b/nlp/pom.xml
@@ -16,7 +16,9 @@
specific language governing permissions and limitations
under the License.
-->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
@@ -33,6 +35,7 @@
<properties>
<main.basedir>${project.parent.basedir}</main.basedir>
<lucene.version>8.8.2</lucene.version>
+
<lucene-analyzers-kuromoji-neologd.version>8.8.2-20200910.2</lucene-analyzers-kuromoji-neologd.version>
</properties>
<dependencies>
@@ -109,6 +112,12 @@
<version>${lucene.version}</version>
<scope>compile</scope>
</dependency>
+ <dependency>
+ <groupId>io.github.myui</groupId>
+
<artifactId>lucene-analyzers-kuromoji-neologd</artifactId>
+
<version>${lucene-analyzers-kuromoji-neologd.version}</version>
+ <scope>compile</scope>
+ </dependency>
<!-- test scope -->
<dependency>
@@ -125,4 +134,13 @@
</dependencies>
+ <build>
+ <resources>
+ <resource>
+ <directory>src/main/resources</directory>
+ <filtering>true</filtering>
+ </resource>
+ </resources>
+ </build>
+
</project>
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
similarity index 92%
copy from nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
copy to nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
index 879c1a5..f3303a0 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
@@ -40,6 +40,7 @@ import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
+import java.util.Properties;
import java.util.Set;
import javax.annotation.Nonnull;
@@ -57,21 +58,21 @@ import
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn
import org.apache.hadoop.io.Text;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
-import org.apache.lucene.analysis.ja.JapaneseTokenizer;
-import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
-import org.apache.lucene.analysis.ja.dict.UserDictionary;
-import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
+import org.apache.lucene.analysis.ja.neologd.JapaneseAnalyzer;
+import org.apache.lucene.analysis.ja.neologd.JapaneseTokenizer;
+import org.apache.lucene.analysis.ja.neologd.JapaneseTokenizer.Mode;
+import org.apache.lucene.analysis.ja.neologd.dict.UserDictionary;
+import
org.apache.lucene.analysis.ja.neologd.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-@Description(name = "tokenize_ja",
+@Description(name = "tokenize_ja_neologd",
value = "_FUNC_(String line [, const string mode = \"normal\", const
array<string> stopWords, const array<string> stopTags, const array<string>
userDict (or string userDictURL)])"
+ " - returns tokenized strings in array<string>",
- extended = "select
tokenize_ja(\"kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。\");\n"
+ extended = "select
tokenize_ja_neologd(\"kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。\");\n"
+ "\n"
+ ">
[\"kuromoji\",\"使う\",\"分かち書き\",\"テスト\",\"第\",\"二\",\"引数\",\"normal\",\"search\",\"extended\",\"指定\",\"デフォルト\",\"normal\",\"
モード\"]\n")
@UDFType(deterministic = true, stateful = false)
-public final class KuromojiUDF extends UDFWithOptions {
+public final class KuromojiNEologdUDF extends UDFWithOptions {
private static final int CONNECT_TIMEOUT_MS = 10000; // 10 sec
private static final int READ_TIMEOUT_MS = 60000; // 60 sec
private static final long MAX_INPUT_STREAM_SIZE = 32L * 1024L * 1024L; //
~32MB
@@ -111,7 +112,7 @@ public final class KuromojiUDF extends UDFWithOptions {
@Override
public ObjectInspector initialize(ObjectInspector[] arguments) throws
UDFArgumentException {
final int arglen = arguments.length;
- if (arglen < 1 || arglen > 5) {
+ if (arglen > 5) {
showHelp("Invalid number of arguments for `tokenize_ja`: " +
arglen);
}
@@ -166,6 +167,17 @@ public final class KuromojiUDF extends UDFWithOptions {
@Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
+ if (arguments.length == 0) {
+ final Properties properties = new Properties();
+ try {
+
properties.load(this.getClass().getResourceAsStream("tokenizer.properties"));
+ } catch (IOException e) {
+ throw new HiveException("Failed to read tokenizer.properties");
+ }
+ return Collections.singletonList(
+ new
Text(properties.getProperty("tokenizer_ja_neologd.version")));
+ }
+
if (_analyzer == null) {
CharArraySet stopWords = stopWords(_stopWordsArray);
@@ -401,7 +413,7 @@ public final class KuromojiUDF extends UDFWithOptions {
@Override
public String getDisplayString(String[] children) {
- return "tokenize_ja(" + Arrays.toString(children) + ')';
+ return "tokenize_ja_neologd(" + Arrays.toString(children) + ')';
}
}
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
index 879c1a5..7902f60 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
@@ -40,6 +40,7 @@ import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
+import java.util.Properties;
import java.util.Set;
import javax.annotation.Nonnull;
@@ -111,7 +112,7 @@ public final class KuromojiUDF extends UDFWithOptions {
@Override
public ObjectInspector initialize(ObjectInspector[] arguments) throws
UDFArgumentException {
final int arglen = arguments.length;
- if (arglen < 1 || arglen > 5) {
+ if (arglen > 5) {
showHelp("Invalid number of arguments for `tokenize_ja`: " +
arglen);
}
@@ -166,6 +167,17 @@ public final class KuromojiUDF extends UDFWithOptions {
@Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
+ if (arguments.length == 0) {
+ final Properties properties = new Properties();
+ try {
+
properties.load(this.getClass().getResourceAsStream("tokenizer.properties"));
+ } catch (IOException e) {
+ throw new HiveException("Failed to read tokenizer.properties");
+ }
+ return Collections.singletonList(
+ new Text(properties.getProperty("tokenizer_ja.version")));
+ }
+
if (_analyzer == null) {
CharArraySet stopWords = stopWords(_stopWordsArray);
diff --git a/nlp/src/main/resources/hivemall/nlp/tokenizer/tokenizer.properties
b/nlp/src/main/resources/hivemall/nlp/tokenizer/tokenizer.properties
new file mode 100644
index 0000000..46e5acd
--- /dev/null
+++ b/nlp/src/main/resources/hivemall/nlp/tokenizer/tokenizer.properties
@@ -0,0 +1,2 @@
+tokenizer_ja.version=${lucene.version}
+tokenizer_ja_neologd.version=${lucene-analyzers-kuromoji-neologd.version}
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiNEologdUDFTest.java
similarity index 51%
copy from nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
copy to nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiNEologdUDFTest.java
index 2a3de26..d6a8ed9 100644
--- a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiNEologdUDFTest.java
@@ -18,13 +18,8 @@
*/
package hivemall.nlp.tokenizer;
-import hivemall.TestUtils;
-import hivemall.utils.hadoop.HiveUtils;
-import hivemall.utils.lang.PrivilegedAccessor;
-
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
@@ -36,16 +31,24 @@ import
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.io.Text;
-import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
-import org.hamcrest.CoreMatchers;
import org.junit.Assert;
import org.junit.Test;
-public class KuromojiUDFTest {
+public class KuromojiNEologdUDFTest {
+
+ @Test
+ public void testNoArgument() throws IOException, HiveException {
+ GenericUDF udf = new KuromojiNEologdUDF();
+ ObjectInspector[] argOIs = new ObjectInspector[0];
+ udf.initialize(argOIs);
+ Object result = udf.evaluate(new DeferredObject[0]);
+ Assert.assertNotNull(result);
+ udf.close();
+ }
@Test
public void testOneArgument() throws UDFArgumentException, IOException {
- GenericUDF udf = new KuromojiUDF();
+ GenericUDF udf = new KuromojiNEologdUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
@@ -55,7 +58,7 @@ public class KuromojiUDFTest {
@Test
public void testTwoArgument() throws UDFArgumentException, IOException {
- GenericUDF udf = new KuromojiUDF();
+ GenericUDF udf = new KuromojiNEologdUDF();
ObjectInspector[] argOIs = new ObjectInspector[2];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
@@ -69,7 +72,7 @@ public class KuromojiUDFTest {
}
public void testExpectedMode() throws UDFArgumentException, IOException {
- GenericUDF udf = new KuromojiUDF();
+ GenericUDF udf = new KuromojiNEologdUDF();
ObjectInspector[] argOIs = new ObjectInspector[2];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
@@ -84,7 +87,7 @@ public class KuromojiUDFTest {
@Test(expected = UDFArgumentException.class)
public void testInvalidMode() throws IOException, HiveException {
- GenericUDF udf = new KuromojiUDF();
+ GenericUDF udf = new KuromojiNEologdUDF();
ObjectInspector[] argOIs = new ObjectInspector[2];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
@@ -98,7 +101,7 @@ public class KuromojiUDFTest {
DeferredObject[] args = new DeferredObject[1];
args[0] = new DeferredObject() {
public Text get() throws HiveException {
- return new Text("クロモジのJapaneseAnalyzerを使ってみる。テスト。");
+ return new Text("こんにちは。");
}
@Override
@@ -111,7 +114,7 @@ public class KuromojiUDFTest {
@Test
public void testThreeArgument() throws UDFArgumentException, IOException {
- GenericUDF udf = new KuromojiUDF();
+ GenericUDF udf = new KuromojiNEologdUDF();
ObjectInspector[] argOIs = new ObjectInspector[3];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
@@ -129,7 +132,7 @@ public class KuromojiUDFTest {
@Test
public void testFourArgument() throws UDFArgumentException, IOException {
- GenericUDF udf = new KuromojiUDF();
+ GenericUDF udf = new KuromojiNEologdUDF();
ObjectInspector[] argOIs = new ObjectInspector[4];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
@@ -150,7 +153,7 @@ public class KuromojiUDFTest {
@Test
public void testFiveArgumentArray() throws UDFArgumentException,
IOException {
- GenericUDF udf = new KuromojiUDF();
+ GenericUDF udf = new KuromojiNEologdUDF();
ObjectInspector[] argOIs = new ObjectInspector[5];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
@@ -173,8 +176,8 @@ public class KuromojiUDFTest {
}
@Test
- public void testFiveArgumenString() throws UDFArgumentException,
IOException {
- GenericUDF udf = new KuromojiUDF();
+ public void testFiveArgumentString() throws UDFArgumentException,
IOException {
+ GenericUDF udf = new KuromojiNEologdUDF();
ObjectInspector[] argOIs = new ObjectInspector[5];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
@@ -196,9 +199,21 @@ public class KuromojiUDFTest {
udf.close();
}
+ @SuppressWarnings("unchecked")
+ @Test
+ public void testEvaluateReturnsVersion() throws IOException, HiveException
{
+ KuromojiNEologdUDF udf = new KuromojiNEologdUDF();
+ DeferredObject[] args = new DeferredObject[0];
+ List<Text> tokens = (List<Text>) udf.evaluate(args);
+ Assert.assertNotNull(tokens);
+ Assert.assertEquals(1, tokens.size());
+ udf.close();
+ }
+
+ @SuppressWarnings("unchecked")
@Test
public void testEvaluateOneRow() throws IOException, HiveException {
- KuromojiUDF udf = new KuromojiUDF();
+ KuromojiNEologdUDF udf = new KuromojiNEologdUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
argOIs[0] =
PrimitiveObjectInspectorFactory.writableStringObjectInspector;
@@ -207,23 +222,22 @@ public class KuromojiUDFTest {
DeferredObject[] args = new DeferredObject[1];
args[0] = new DeferredObject() {
public Text get() throws HiveException {
- return new Text("クロモジのJapaneseAnalyzerを使ってみる。テスト。");
+ return new
Text("10日放送の「中居正広のミになる図書館」(テレビ朝日系)で、SMAPの中居正広が、篠原信一の過去の勘違いを明かす一幕があった。");
}
@Override
public void prepare(int arg) throws HiveException {}
};
- @SuppressWarnings("unchecked")
List<Text> tokens = (List<Text>) udf.evaluate(args);
Assert.assertNotNull(tokens);
- Assert.assertEquals(5, tokens.size());
+ Assert.assertEquals(12, tokens.size());
udf.close();
}
@SuppressWarnings("unchecked")
@Test
public void testEvaluateTwoRows() throws IOException, HiveException {
- KuromojiUDF udf = new KuromojiUDF();
+ KuromojiNEologdUDF udf = new KuromojiNEologdUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
argOIs[0] =
PrimitiveObjectInspectorFactory.writableStringObjectInspector;
@@ -232,7 +246,7 @@ public class KuromojiUDFTest {
DeferredObject[] args = new DeferredObject[1];
args[0] = new DeferredObject() {
public Text get() throws HiveException {
- return new Text("クロモジのJapaneseAnalyzerを使ってみる。テスト。");
+ return new
Text("10日放送の「中居正広のミになる図書館」(テレビ朝日系)で、SMAPの中居正広が、篠原信一の過去の勘違いを明かす一幕があった。");
}
@Override
@@ -240,11 +254,11 @@ public class KuromojiUDFTest {
};
List<Text> tokens = (List<Text>) udf.evaluate(args);
Assert.assertNotNull(tokens);
- Assert.assertEquals(5, tokens.size());
+ Assert.assertEquals(12, tokens.size());
args[0] = new DeferredObject() {
public Text get() throws HiveException {
- return new Text("クロモジのJapaneseAnalyzerを使ってみる。");
+ return new Text("きゃりーぱみゅぱみゅ。");
}
@Override
@@ -252,39 +266,15 @@ public class KuromojiUDFTest {
};
tokens = (List<Text>) udf.evaluate(args);
Assert.assertNotNull(tokens);
- Assert.assertEquals(4, tokens.size());
-
- udf.close();
- }
-
- @Test
- public void testEvaluateLongRow() throws IOException, HiveException {
- KuromojiUDF udf = new KuromojiUDF();
- ObjectInspector[] argOIs = new ObjectInspector[1];
- // line
- argOIs[0] =
PrimitiveObjectInspectorFactory.writableStringObjectInspector;
- udf.initialize(argOIs);
-
- DeferredObject[] args = new DeferredObject[1];
- args[0] = new DeferredObject() {
- public Text get() throws HiveException {
- return new Text(
- "商品の購入・詳細(サイズ、画像)は商品名をクリックしてください![L.B CANDY
STOCK]フラワービジューベアドレス[L.B DAILY STOCK]ボーダーニットトップス[L.B DAILY
STOCK]ボーダーロングニットOP[L.B DAILY STOCK]ロゴトートBAG[L.B DAILY
STOCK]裏毛ロゴプリントプルオーバー【TVドラマ着用】アンゴラワッフルカーディガン【TVドラマ着用】グラフィティーバックリボンワンピース【TVドラマ着用】ボーダーハイネックトップス【TVドラマ着用】レオパードミッドカーフスカート【セットアップ対応商品】起毛ニットスカート【セットアップ対応商品】起毛ニットプルオーバー2wayサングラス33ナンバーリングニット3Dショルダーフレアードレス3周年スリッパ3周年ラグマット3周年ロックグラスキャンドルLily
Brown 2015年 福�
��MIXニットプルオーバーPeckhamロゴニットアンゴラジャガードプルオーバーアンゴラタートルアンゴラチュニックアンゴラニットカーディガンアンゴラ
[...]
- }
+ Assert.assertEquals(1, tokens.size());
- @Override
- public void prepare(int arg) throws HiveException {}
- };
- @SuppressWarnings("unchecked")
- List<Text> tokens = (List<Text>) udf.evaluate(args);
- Assert.assertNotNull(tokens);
- Assert.assertEquals(182, tokens.size());
udf.close();
}
+ @SuppressWarnings("unchecked")
@Test
public void testEvaluateUserDictArray() throws IOException, HiveException {
- KuromojiUDF udf = new KuromojiUDF();
+ KuromojiNEologdUDF udf = new KuromojiNEologdUDF();
ObjectInspector[] argOIs = new ObjectInspector[5];
// line
argOIs[0] =
PrimitiveObjectInspectorFactory.writableStringObjectInspector;
@@ -317,7 +307,6 @@ public class KuromojiUDFTest {
public void prepare(int arg) throws HiveException {}
};
- @SuppressWarnings("unchecked")
List<Text> tokens = (List<Text>) udf.evaluate(args);
Assert.assertNotNull(tokens);
@@ -326,9 +315,10 @@ public class KuromojiUDFTest {
udf.close();
}
+ @SuppressWarnings("unchecked")
@Test(expected = UDFArgumentException.class)
public void testEvaluateInvalidUserDictURL() throws IOException,
HiveException {
- KuromojiUDF udf = new KuromojiUDF();
+ KuromojiNEologdUDF udf = new KuromojiNEologdUDF();
ObjectInspector[] argOIs = new ObjectInspector[5];
// line
argOIs[0] =
PrimitiveObjectInspectorFactory.writableStringObjectInspector;
@@ -351,23 +341,23 @@ public class KuromojiUDFTest {
DeferredObject[] args = new DeferredObject[1];
args[0] = new DeferredObject() {
public Text get() throws HiveException {
- return new Text("クロモジのJapaneseAnalyzerを使ってみる。テスト。");
+ return new Text("こんにちは。");
}
@Override
public void prepare(int arg) throws HiveException {}
};
- @SuppressWarnings("unchecked")
List<Text> tokens = (List<Text>) udf.evaluate(args);
Assert.assertNotNull(tokens);
udf.close();
}
+ @SuppressWarnings("unchecked")
@Test
public void testEvaluateUserDictURL() throws IOException, HiveException {
- KuromojiUDF udf = new KuromojiUDF();
+ KuromojiNEologdUDF udf = new KuromojiNEologdUDF();
ObjectInspector[] argOIs = new ObjectInspector[5];
// line
argOIs[0] =
PrimitiveObjectInspectorFactory.writableStringObjectInspector;
@@ -399,118 +389,9 @@ public class KuromojiUDFTest {
public void prepare(int arg) throws HiveException {}
};
- @SuppressWarnings("unchecked")
- List<Text> tokens = (List<Text>) udf.evaluate(args);
-
- Assert.assertNotNull(tokens);
- Assert.assertEquals(7, tokens.size());
-
- udf.close();
- }
-
- @Test
- public void testSerialization() throws IOException, HiveException {
- final KuromojiUDF udf = new KuromojiUDF();
- ObjectInspector[] argOIs = new ObjectInspector[1];
- argOIs[0] =
PrimitiveObjectInspectorFactory.writableStringObjectInspector;
- udf.initialize(argOIs);
-
- // serialization after initialization
- byte[] serialized = TestUtils.serializeObjectByKryo(udf);
- TestUtils.deserializeObjectByKryo(serialized, KuromojiUDF.class);
-
- DeferredObject[] args = new DeferredObject[1];
- args[0] = new DeferredObject() {
- public Text get() throws HiveException {
- return new Text("クロモジのJapaneseAnalyzerを使ってみる。テスト。");
- }
-
- @Override
- public void prepare(int arg) throws HiveException {}
- };
- @SuppressWarnings("unchecked")
List<Text> tokens = (List<Text>) udf.evaluate(args);
Assert.assertNotNull(tokens);
-
- // serialization after evaluation
- serialized = TestUtils.serializeObjectByKryo(udf);
- TestUtils.deserializeObjectByKryo(serialized, KuromojiUDF.class);
-
- udf.close();
- }
-
- @Test
- public void testNormalModeWithOption()
- throws IOException, HiveException, IllegalAccessException,
NoSuchFieldException {
- GenericUDF udf = new KuromojiUDF();
- ObjectInspector[] argOIs = new ObjectInspector[2];
-
- argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
// line
- argOIs[1] = HiveUtils.getConstStringObjectInspector("-mode normal");
// mode
- udf.initialize(argOIs);
-
- Object mode = PrivilegedAccessor.getValue(udf, "_mode");
- Assert.assertEquals(Mode.NORMAL, mode);
-
- DeferredObject[] args = new DeferredObject[1];
- args[0] = new DeferredObject() {
- public Text get() throws HiveException {
- return new Text("クロモジのJapaneseAnalyzerを使ってみる。テスト。");
- }
-
- @Override
- public void prepare(int arg) throws HiveException {}
- };
- Object result = udf.evaluate(args);
- Assert.assertThat(Arrays.asList(new Text("クロモジ"), new
Text("japaneseanalyzer"),
- new Text("使う"), new Text("みる"), new Text("テスト")),
CoreMatchers.is(result));
-
- udf.close();
- }
-
- @Test
- public void testNormalModeWithPosOptions()
- throws IOException, HiveException, IllegalAccessException,
NoSuchFieldException {
- GenericUDF udf = new KuromojiUDF();
- ObjectInspector[] argOIs = new ObjectInspector[2];
-
- argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
// line
- argOIs[1] = HiveUtils.getConstStringObjectInspector("-mode normal
-pos"); // mode
- udf.initialize(argOIs);
-
- Object mode = PrivilegedAccessor.getValue(udf, "_mode");
- Assert.assertEquals(Mode.NORMAL, mode);
-
- DeferredObject[] args = new DeferredObject[1];
- args[0] = new DeferredObject() {
- public Text get() throws HiveException {
- return new Text("クロモジのJapaneseAnalyzerを使ってみる。テスト。");
- }
-
- @Override
- public void prepare(int arg) throws HiveException {}
- };
-
- Object[] result = (Object[]) udf.evaluate(args);
- Assert.assertEquals(2, result.length);
-
- Assert.assertEquals(Arrays.asList(new Text("クロモジ"), new
Text("japaneseanalyzer"),
- new Text("使う"), new Text("みる"), new Text("テスト")), result[0]);
- Assert.assertEquals(Arrays.asList(new Text("名詞-一般"), new
Text("名詞-一般"), new Text("動詞-自立"),
- new Text("動詞-非自立"), new Text("名詞-サ変接続")), result[1]);
-
- udf.close();
- }
-
- @Test(expected = UDFArgumentException.class)
- public void testUnsupportedOptionArgs()
- throws IOException, HiveException, IllegalAccessException,
NoSuchFieldException {
- GenericUDF udf = new KuromojiUDF();
- ObjectInspector[] argOIs = new ObjectInspector[2];
-
- argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
// line
- argOIs[1] = HiveUtils.getConstStringObjectInspector("-mode normal
-unsupported_option"); // mode
- udf.initialize(argOIs);
+ Assert.assertEquals(8, tokens.size());
udf.close();
}
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
index 2a3de26..480c5e9 100644
--- a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
@@ -44,6 +44,16 @@ import org.junit.Test;
public class KuromojiUDFTest {
@Test
+ public void testNoArgument() throws IOException, HiveException {
+ GenericUDF udf = new KuromojiUDF();
+ ObjectInspector[] argOIs = new ObjectInspector[0];
+ udf.initialize(argOIs);
+ Object result = udf.evaluate(new DeferredObject[0]);
+ Assert.assertNotNull(result);
+ udf.close();
+ }
+
+ @Test
public void testOneArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
diff --git a/resources/ddl/define-additional.hive
b/resources/ddl/define-additional.hive
deleted file mode 100644
index 802245d..0000000
--- a/resources/ddl/define-additional.hive
+++ /dev/null
@@ -1,39 +0,0 @@
------------------------------------------------------------------------------
--- Hivemall: Hive scalable Machine Learning Library
------------------------------------------------------------------------------
-
-------------------
--- NLP features --
-------------------
-
-drop temporary function if exists tokenize_ja;
-create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';
-
-drop temporary function if exists tokenize_cn;
-create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF';
-
-drop temporary function if exists stoptags_exclude;
-create temporary function stoptags_exclude as
'hivemall.nlp.tokenizer.StoptagsExcludeUDF';
-
-------------------------------
--- XGBoost related features --
-------------------------------
-
-drop temporary function if exists xgboost_version;
-create temporary function xgboost_version as
'hivemall.xgboost.XGBoostVersionUDF';
-
-drop temporary function if exists train_xgboost;
-create temporary function train_xgboost as 'hivemall.xgboost.XGBoostTrainUDTF';
-
-drop temporary function if exists xgboost_predict;
-create temporary function xgboost_predict as
'hivemall.xgboost.XGBoostOnlinePredictUDTF';
-
-drop temporary function if exists xgboost_batch_predict;
-create temporary function xgboost_batch_predict as
'hivemall.xgboost.XGBoostBatchPredictUDTF';
-
-drop temporary function if exists xgboost_predict_one;
-create temporary function xgboost_predict_one as
'hivemall.xgboost.XGBoostPredictOneUDTF';
-
-drop temporary function if exists xgboost_predict_triple;
-create temporary function xgboost_predict_triple as
'hivemall.xgboost.XGBoostPredictTripleUDTF';
-
diff --git a/resources/ddl/define-all-as-permanent.hive
b/resources/ddl/define-all-as-permanent.hive
index c5f2669..209f161 100644
--- a/resources/ddl/define-all-as-permanent.hive
+++ b/resources/ddl/define-all-as-permanent.hive
@@ -902,6 +902,22 @@ CREATE FUNCTION min_by as 'hivemall.tools.aggr.MinByUDAF'
USING JAR '${hivemall_
DROP FUNCTION IF EXISTS majority_vote;
CREATE FUNCTION majority_vote as 'hivemall.tools.aggr.MajorityVoteUDAF' USING
JAR '${hivemall_jar}';
+------------------
+-- NLP features --
+------------------
+
+DROP FUNCTION IF EXISTS tokenize_ja;
+CREATE FUNCTION tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF' USING JAR
'${hivemall_jar}';
+
+DROP FUNCTION IF EXISTS tokenize_ja_neologd;
+CREATE FUNCTION tokenize_ja_neologd as
'hivemall.nlp.tokenizer.KuromojiNEologdUDF' USING JAR '${hivemall_jar}';
+
+DROP FUNCTION IF EXISTS tokenize_cn;
+CREATE FUNCTION tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF' USING JAR
'${hivemall_jar}';
+
+DROP FUNCTION IF EXISTS stoptags_exclude;
+CREATE FUNCTION stoptags_exclude as
'hivemall.nlp.tokenizer.StoptagsExcludeUDF' USING JAR '${hivemall_jar}';
+
------------------------------
-- XGBoost related features --
------------------------------
diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive
index 8bf36e8..0e89507 100644
--- a/resources/ddl/define-all.hive
+++ b/resources/ddl/define-all.hive
@@ -893,3 +893,42 @@ create temporary function min_by as
'hivemall.tools.aggr.MinByUDAF';
drop temporary function if exists majority_vote;
create temporary function majority_vote as
'hivemall.tools.aggr.MajorityVoteUDAF';
+
+------------------
+-- NLP features --
+------------------
+
+drop temporary function if exists tokenize_ja;
+create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';
+
+drop temporary function if exists tokenize_ja_neologd;
+create temporary function tokenize_ja_neologd as
'hivemall.nlp.tokenizer.KuromojiNEologdUDF';
+
+drop temporary function if exists tokenize_cn;
+create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF';
+
+drop temporary function if exists stoptags_exclude;
+create temporary function stoptags_exclude as
'hivemall.nlp.tokenizer.StoptagsExcludeUDF';
+
+------------------------------
+-- XGBoost related features --
+------------------------------
+
+drop temporary function if exists xgboost_version;
+create temporary function xgboost_version as
'hivemall.xgboost.XGBoostVersionUDF';
+
+drop temporary function if exists train_xgboost;
+create temporary function train_xgboost as 'hivemall.xgboost.XGBoostTrainUDTF';
+
+drop temporary function if exists xgboost_predict;
+create temporary function xgboost_predict as
'hivemall.xgboost.XGBoostOnlinePredictUDTF';
+
+drop temporary function if exists xgboost_batch_predict;
+create temporary function xgboost_batch_predict as
'hivemall.xgboost.XGBoostBatchPredictUDTF';
+
+drop temporary function if exists xgboost_predict_one;
+create temporary function xgboost_predict_one as
'hivemall.xgboost.XGBoostPredictOneUDTF';
+
+drop temporary function if exists xgboost_predict_triple;
+create temporary function xgboost_predict_triple as
'hivemall.xgboost.XGBoostPredictTripleUDTF';
+
diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark
index 91c6350..6eb2921 100644
--- a/resources/ddl/define-all.spark
+++ b/resources/ddl/define-all.spark
@@ -879,3 +879,41 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION min_by AS
'hivemall.tools.aggr.MinByUD
sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS majority_vote")
sqlContext.sql("CREATE TEMPORARY FUNCTION majority_vote AS
'hivemall.tools.aggr.MajorityVoteUDAF'")
+
+/**
+ * NLP features
+ */
+
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS tokenize_ja")
+sqlContext.sql("CREATE TEMPORARY FUNCTION tokenize_ja AS
'hivemall.nlp.tokenizer.KuromojiUDF'")
+
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS tokenize_ja_neologd")
+sqlContext.sql("CREATE TEMPORARY FUNCTION tokenize_ja_neologd AS
'hivemall.nlp.tokenizer.KuromojiNEologdUDF'")
+
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS tokenize_cn")
+sqlContext.sql("CREATE TEMPORARY FUNCTION tokenize_cn AS
'hivemall.nlp.tokenizer.SmartcnUDF'")
+
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS stoptags_exclude")
+sqlContext.sql("CREATE TEMPORARY FUNCTION stoptags_exclude AS
'hivemall.nlp.tokenizer.StoptagsExcludeUDF'")
+
+/**
+ * XGBoost related features
+ */
+
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS xgboost_version")
+sqlContext.sql("CREATE TEMPORARY FUNCTION xgboost_version AS
'hivemall.xgboost.XGBoostVersionUDF'")
+
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS train_xgboost")
+sqlContext.sql("CREATE TEMPORARY FUNCTION train_xgboost AS
'hivemall.xgboost.XGBoostTrainUDTF'")
+
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS xgboost_predict")
+sqlContext.sql("CREATE TEMPORARY FUNCTION xgboost_predict AS
'hivemall.xgboost.XGBoostOnlinePredictUDTFF'")
+
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS xgboost_batch_predict")
+sqlContext.sql("CREATE TEMPORARY FUNCTION xgboost_batch_predict AS
'hivemall.xgboost.XGBoostBatchPredictUDTF'")
+
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS xgboost_predict_one")
+sqlContext.sql("CREATE TEMPORARY FUNCTION xgboost_predict_one AS
'hivemall.xgboost.XGBoostPredictOneUDTF'")
+
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS xgboost_predict_triple")
+sqlContext.sql("CREATE TEMPORARY FUNCTION xgboost_predict_triple AS
'hivemall.xgboost.XGBoostPredictTripleUDTF'")
diff --git
a/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGeneratorMojo.java
b/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGeneratorMojo.java
index c383bd9..df57378 100644
---
a/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGeneratorMojo.java
+++
b/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGeneratorMojo.java
@@ -147,6 +147,7 @@ public class FuncsListGeneratorMojo extends AbstractMojo {
"hivemall.smile.classification", "hivemall.smile.regression",
"hivemall.smile.tools"));
funcsHeaders.put("# XGBoost", Arrays.asList("hivemall.xgboost"));
funcsHeaders.put("# Term Vector Model",
Collections.singletonList("hivemall.ftvec.text"));
+ funcsHeaders.put("# NLP",
Collections.singletonList("hivemall.nlp.tokenizer"));
funcsHeaders.put("# Others",
Arrays.asList("hivemall", "hivemall.dataset",
"hivemall.ftvec.text"));
}