Repository: incubator-hivemall Updated Branches: refs/heads/master e3bbaf622 -> 3cbc6647e
Close #97: [HIVEMALL-130] Support user-defined dictionary for tokenize_ja Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/3cbc6647 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/3cbc6647 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/3cbc6647 Branch: refs/heads/master Commit: 3cbc6647e8e502f71dd4da2baf599edc606db895 Parents: e3bbaf6 Author: Takuya Kitazawa <[email protected]> Authored: Sat Jul 15 01:04:26 2017 +0900 Committer: Makoto Yui <[email protected]> Committed: Sat Jul 15 02:45:40 2017 +0900 ---------------------------------------------------------------------- .../java/hivemall/utils/hadoop/HiveUtils.java | 10 ++ .../main/java/hivemall/utils/io/HttpUtils.java | 51 ++++++ .../main/java/hivemall/utils/io/IOUtils.java | 28 +++ .../hivemall/utils/io/LimitedInputStream.java | 87 ++++++++++ .../utils/io/LimitedInputStreamTest.java | 92 ++++++++++ docs/gitbook/misc/tokenizer.md | 48 ++++- .../hivemall/nlp/tokenizer/KuromojiUDF.java | 129 ++++++++++++-- .../java/hivemall/nlp/tokenizer/SmartcnUDF.java | 3 +- .../hivemall/nlp/tokenizer/KuromojiUDFTest.java | 173 +++++++++++++++++++ 9 files changed, 597 insertions(+), 24 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3cbc6647/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java b/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java index cb2b5e3..0b68de8 100644 --- a/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java +++ b/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java @@ -27,6 +27,7 @@ import static hivemall.HivemallConstants.INT_TYPE_NAME; import static hivemall.HivemallConstants.SMALLINT_TYPE_NAME; import static 
hivemall.HivemallConstants.STRING_TYPE_NAME; import static hivemall.HivemallConstants.TINYINT_TYPE_NAME; +import static hivemall.HivemallConstants.VOID_TYPE_NAME; import java.nio.charset.StandardCharsets; import java.util.Arrays; @@ -228,6 +229,11 @@ public final class HiveUtils { return oi.getCategory() == Category.STRUCT; } + public static boolean isVoidOI(@Nonnull final ObjectInspector oi) { + String typeName = oi.getTypeName(); + return VOID_TYPE_NAME.equals(typeName); + } + public static boolean isStringOI(@Nonnull final ObjectInspector oi) { String typeName = oi.getTypeName(); return STRING_TYPE_NAME.equals(typeName); @@ -303,6 +309,10 @@ public final class HiveUtils { && isNumberListOI(((ListObjectInspector) oi).getListElementObjectInspector()); } + public static boolean isConstListOI(@Nonnull final ObjectInspector oi) { + return ObjectInspectorUtils.isConstantObjectInspector(oi) && isListOI(oi); + } + public static boolean isConstString(@Nonnull final ObjectInspector oi) { return ObjectInspectorUtils.isConstantObjectInspector(oi) && isStringOI(oi); } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3cbc6647/core/src/main/java/hivemall/utils/io/HttpUtils.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/utils/io/HttpUtils.java b/core/src/main/java/hivemall/utils/io/HttpUtils.java new file mode 100644 index 0000000..6994cfe --- /dev/null +++ b/core/src/main/java/hivemall/utils/io/HttpUtils.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.utils.io; + +import java.io.IOException; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.URL; +import java.net.URLConnection; + +import javax.annotation.Nonnegative; +import javax.annotation.Nonnull; + +public final class HttpUtils { + + private HttpUtils() {} + + @Nonnull + public static HttpURLConnection getHttpURLConnection(@Nonnull String urlStr) + throws IllegalArgumentException, IOException { + if (!urlStr.startsWith("http://") && !urlStr.startsWith("https://")) { + throw new IllegalArgumentException("Unexpected url: " + urlStr); + } + URL url = new URL(urlStr); + URLConnection conn = url.openConnection(); + return (HttpURLConnection) conn; + } + + @Nonnull + public static InputStream getLimitedInputStream(@Nonnull HttpURLConnection conn, + @Nonnegative long size) throws IOException { + InputStream is = conn.getInputStream(); + return new LimitedInputStream(is, size); + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3cbc6647/core/src/main/java/hivemall/utils/io/IOUtils.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/utils/io/IOUtils.java b/core/src/main/java/hivemall/utils/io/IOUtils.java index 919fe17..27d4b49 100644 --- a/core/src/main/java/hivemall/utils/io/IOUtils.java +++ b/core/src/main/java/hivemall/utils/io/IOUtils.java @@ -33,6 +33,8 @@ import java.io.InputStreamReader; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.OutputStream; +import 
java.io.PushbackInputStream; +import java.util.zip.GZIPInputStream; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -129,6 +131,32 @@ public final class IOUtils { return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0)); } + /** + * Look ahead InputStream and decompress it as GZIPInputStream if needed + * + * @link https://stackoverflow.com/a/4818946 + */ + @Nonnull + public static InputStream decodeInputStream(@Nonnull final InputStream is) throws IOException { + final PushbackInputStream pb = new PushbackInputStream(is, 2); + + // look ahead + final byte[] signature = new byte[2]; + final int nread = pb.read(signature); + // If no byte is available because the stream is at the end of the file, the value -1 is returned; + // otherwise, at least one byte is read and stored into b. + if (nread > 0) {// may be -1 (EOF) or 1 or 2 + pb.unread(signature, 0, nread); // push back + } + + final int streamHeader = ((int) signature[0] & 0xff) | ((signature[1] << 8) & 0xff00); + if (streamHeader == GZIPInputStream.GZIP_MAGIC) { + return new GZIPInputStream(pb); + } else { + return pb; + } + } + public static void writeChar(final char v, final OutputStream out) throws IOException { out.write(0xff & (v >> 8)); out.write(0xff & v); http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3cbc6647/core/src/main/java/hivemall/utils/io/LimitedInputStream.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/utils/io/LimitedInputStream.java b/core/src/main/java/hivemall/utils/io/LimitedInputStream.java new file mode 100644 index 0000000..f9bb07c --- /dev/null +++ b/core/src/main/java/hivemall/utils/io/LimitedInputStream.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.utils.io; + +import hivemall.utils.lang.Preconditions; + +import java.io.FilterInputStream; +import java.io.IOException; +import java.io.InputStream; + +import javax.annotation.CheckForNull; +import javax.annotation.Nonnegative; + +/** + * Input stream which is limited to a certain length. Implementation is based on LimitedInputStream + * in Apache Commons FileUpload. + * + * @link + * https://commons.apache.org/proper/commons-fileupload/apidocs/org/apache/commons/fileupload/util + * /LimitedInputStream.html + */ +public class LimitedInputStream extends FilterInputStream { + + protected final long max; + protected long pos = 0L; + + public LimitedInputStream(@CheckForNull final InputStream in, @Nonnegative final long maxSize) { + super(in); + Preconditions.checkNotNull(in, "Base input stream must not be null"); + this.max = maxSize; + } + + protected void raiseError() throws IOException { + throw new IOException("Exceeded maximum size of input stream: limit = " + max + + " bytes, but pos = " + pos); + } + + private void proceed(@Nonnegative final long bytes) throws IOException { + this.pos += bytes; + if (pos > max) { + raiseError(); + } + } + + @Override + public int read() throws IOException { + final int res = super.read(); + if (res != -1) { + proceed(1L); + } + return res; + } + + @Override + public int read(final byte[] b, final int off, final int len) 
throws IOException { + final int res = super.read(b, off, len); + if (res > 0) { + proceed(res); + } + return res; + } + + @Override + public long skip(final long n) throws IOException { + final long res = super.skip(n); + if (res > 0) { + proceed(res); + } + return res; + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3cbc6647/core/src/test/java/hivemall/utils/io/LimitedInputStreamTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/utils/io/LimitedInputStreamTest.java b/core/src/test/java/hivemall/utils/io/LimitedInputStreamTest.java new file mode 100644 index 0000000..18d17bf --- /dev/null +++ b/core/src/test/java/hivemall/utils/io/LimitedInputStreamTest.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.utils.io; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; + +import org.junit.Assert; +import org.junit.Test; + +public class LimitedInputStreamTest { + + @Test + public void testExactSize() throws IOException { + String expected = "abcdef"; + int len = expected.length(); + + InputStream is = new FastByteArrayInputStream(expected.getBytes()); + LimitedInputStream isLimited = new LimitedInputStream(is, len); + + Reader reader = new InputStreamReader(isLimited); + BufferedReader br = new BufferedReader(reader); + + char[] buf = new char[len]; + br.read(buf); + + Assert.assertTrue(expected.equals(new String(buf))); + + br.close(); + } + + @Test + public void testLooseSize() throws IOException { + String expected = "abcdef"; + int len = expected.length(); + + InputStream is = new FastByteArrayInputStream(expected.getBytes()); + LimitedInputStream isLimited = new LimitedInputStream(is, len + 100); // large enough + + Reader reader = new InputStreamReader(isLimited); + BufferedReader br = new BufferedReader(reader); + + char[] buf = new char[len]; + br.read(buf); + + Assert.assertTrue(expected.equals(new String(buf))); + + br.close(); + } + + @Test(expected = IOException.class) + public void testExceed() throws IOException { + String expected = "abcdef"; + int len = expected.length(); + + InputStream is = new FastByteArrayInputStream(expected.getBytes()); + LimitedInputStream isLimited = new LimitedInputStream(is, len - 1); // not enough + + Reader reader = new InputStreamReader(isLimited); + BufferedReader br = new BufferedReader(reader); + + char[] buf = new char[len]; + br.read(buf); + + br.close(); + } + + @Test(expected = NullPointerException.class) + public void testNullInputStream() throws NullPointerException, IOException { + new LimitedInputStream(null, 100).close(); + } + +} 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3cbc6647/docs/gitbook/misc/tokenizer.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md index 99f281d..07c8cd1 100644 --- a/docs/gitbook/misc/tokenizer.md +++ b/docs/gitbook/misc/tokenizer.md @@ -28,28 +28,62 @@ tokenize(text input, optional boolean toLowerCase = false) Hivemall-NLP module provides some Non-English Text tokenizer UDFs as follows. -First of all, you need to issue the following DDLs to use the NLP module. Note NLP module is not included in [hivemall-with-dependencies.jar](https://github.com/myui/hivemall/releases). +First of all, you need to issue the following DDLs to use the NLP module. Note NLP module is not included in `hivemall-with-dependencies.jar`. -> add jar /tmp/[hivemall-nlp-xxx-with-dependencies.jar](https://github.com/myui/hivemall/releases); +> add jar /path/to/hivemall-nlp-xxx-with-dependencies.jar; -> source /tmp/[define-additional.hive](https://github.com/myui/hivemall/releases); +> source /path/to/define-additional.hive; ## Japanese Tokenizer Japanese text tokenizer UDF uses [Kuromoji](https://github.com/atilika/kuromoji). The signature of the UDF is as follows: + ```sql -tokenize_ja(text input, optional const text mode = "normal", optional const array<string> stopWords, optional const array<string> stopTags) +tokenize_ja(text input, optional const text mode = "normal", optional const array<string> stopWords, const array<string> stopTags, const array<string> userDict) ``` -_Caution: `tokenize_ja` is supported since Hivemall v0.4.1 and later._ -It's basic usage is as follows: +> #### Note +> `tokenize_ja` is supported since Hivemall v0.4.1, and the fifth argument is supported since v0.5-rc.1 and later. 
+ +Its basic usage is as follows: ```sql select tokenize_ja("kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。"); ``` > ["kuromoji","使う","分かち書き","テスト","第","二","引数","normal","search","extended","指定","デフォルト","normal","モード"] +In addition, the third and fourth arguments respectively allow you to use your own list of stop words and stop tags. For example, the following query simply ignores "kuromoji" (as a stop word) and noun word "分かち書き" (as a stop tag): + +```sql +select tokenize_ja("kuromojiを使った分かち書きのテストです。", "normal", array("kuromoji"), array("名詞-一般")); +``` + +> ["を","使う","た","の","テスト","です"] + +Moreover, the fifth argument `userDict` enables you to register a user-defined custom dictionary in [Kuromoji official format](https://github.com/atilika/kuromoji/blob/909fd6b32bf4e9dc86b7599de5c9b50ca8f004a1/kuromoji-core/src/test/resources/userdict.txt): + +```sql +select tokenize_ja("日本経済新聞＆関西国際空港", "normal", null, null, + array( + "日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞", + "関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,テスト名詞" + )); +``` + +> ["日本","経済","新聞","関西","国際","空港"] + +Note that you can pass `null` to each of the third and fourth arguments to explicitly use Kuromoji's default stop words and stop tags. + +If you have a large custom dictionary as an external file, `userDict` can also be `const string userDictURL` which indicates the URL of the external file hosted somewhere like Amazon S3: + +```sql +select tokenize_ja("日本経済新聞＆関西国際空港", "normal", null, null, + "https://raw.githubusercontent.com/atilika/kuromoji/909fd6b32bf4e9dc86b7599de5c9b50ca8f004a1/kuromoji-core/src/test/resources/userdict.txt"); +``` + +> ["日本","経済","新聞","関西","国際","空港"] + For detailed APIs, please refer to the Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well.
## Chinese Tokenizer @@ -61,7 +95,7 @@ The signature of the UDF is as follows: tokenize_cn(string line, optional const array<string> stopWords) ``` -It's basic usage is as follows: +Its basic usage is as follows: ```sql select tokenize_cn("Smartcn为Apache2.0åè®®ç弿ºä¸æåè¯ç³»ç»ï¼Javaè¯è¨ç¼åï¼ä¿®æ¹çä¸ç§é¢è®¡ç®æICTCLASåè¯ç³»ç»ã"); ``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3cbc6647/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java ---------------------------------------------------------------------- diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java index ea977cc..93fd18c 100644 --- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java +++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java @@ -20,8 +20,14 @@ package hivemall.nlp.tokenizer; import hivemall.utils.hadoop.HiveUtils; import hivemall.utils.io.IOUtils; +import hivemall.utils.io.HttpUtils; +import java.io.InputStream; +import java.io.InputStreamReader; import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.net.HttpURLConnection; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -30,6 +36,7 @@ import java.util.List; import java.util.Set; import javax.annotation.Nonnull; +import javax.annotation.Nullable; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; @@ -44,19 +51,24 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ja.JapaneseAnalyzer; import org.apache.lucene.analysis.ja.JapaneseTokenizer; import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode; +import org.apache.lucene.analysis.ja.dict.UserDictionary; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.util.CharArraySet; @Description( name = "tokenize_ja", - value = "_FUNC_(String line [, const string mode = 
\"normal\", const list<string> stopWords, const list<string> stopTags])" + value = "_FUNC_(String line [, const string mode = \"normal\", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or string userDictURL)])" + " - returns tokenized strings in array<string>") @UDFType(deterministic = true, stateful = false) public final class KuromojiUDF extends GenericUDF { + private static final int CONNECT_TIMEOUT_MS = 10000; // 10 sec + private static final int READ_TIMEOUT_MS = 60000; // 60 sec + private static final long MAX_INPUT_STREAM_SIZE = 32L * 1024L * 1024L; // ~32MB private Mode _mode; - private String[] _stopWordsArray; - private Set<String> _stoptags; + private CharArraySet _stopWords; + private Set<String> _stopTags; + private UserDictionary _userDict; // workaround to avoid org.apache.hive.com.esotericsoftware.kryo.KryoException: java.util.ConcurrentModificationException private transient JapaneseAnalyzer _analyzer; @@ -64,15 +76,18 @@ public final class KuromojiUDF extends GenericUDF { @Override public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { final int arglen = arguments.length; - if (arglen < 1 || arglen > 4) { + if (arglen < 1 || arglen > 5) { throw new UDFArgumentException("Invalid number of arguments for `tokenize_ja`: " + arglen); } this._mode = (arglen >= 2) ? tokenizationMode(arguments[1]) : Mode.NORMAL; - this._stopWordsArray = (arglen >= 3) ? HiveUtils.getConstStringArray(arguments[2]) : null; - this._stoptags = (arglen >= 4) ? stopTags(arguments[3]) + this._stopWords = (arglen >= 3) ? stopWords(arguments[2]) + : JapaneseAnalyzer.getDefaultStopSet(); + this._stopTags = (arglen >= 4) ? stopTags(arguments[3]) : JapaneseAnalyzer.getDefaultStopTags(); + this._userDict = (arglen >= 5) ? 
userDictionary(arguments[4]) : null; + this._analyzer = null; return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector); @@ -80,11 +95,8 @@ public final class KuromojiUDF extends GenericUDF { @Override public List<Text> evaluate(DeferredObject[] arguments) throws HiveException { - JapaneseAnalyzer analyzer = _analyzer; - if (analyzer == null) { - CharArraySet stopwords = stopWords(_stopWordsArray); - analyzer = new JapaneseAnalyzer(null, _mode, stopwords, _stoptags); - this._analyzer = analyzer; + if (_analyzer == null) { + this._analyzer = new JapaneseAnalyzer(_userDict, _mode, _stopWords, _stopTags); } Object arg0 = arguments[0].get(); @@ -96,12 +108,12 @@ public final class KuromojiUDF extends GenericUDF { final List<Text> results = new ArrayList<Text>(32); TokenStream stream = null; try { - stream = analyzer.tokenStream("", line); + stream = _analyzer.tokenStream("", line); if (stream != null) { analyzeTokens(stream, results); } } catch (IOException e) { - IOUtils.closeQuietly(analyzer); + IOUtils.closeQuietly(_analyzer); throw new HiveException(e); } finally { IOUtils.closeQuietly(stream); @@ -115,7 +127,8 @@ public final class KuromojiUDF extends GenericUDF { } @Nonnull - private static Mode tokenizationMode(@Nonnull ObjectInspector oi) throws UDFArgumentException { + private static Mode tokenizationMode(@Nonnull final ObjectInspector oi) + throws UDFArgumentException { final String arg = HiveUtils.getConstString(oi); if (arg == null) { return Mode.NORMAL; @@ -137,8 +150,12 @@ public final class KuromojiUDF extends GenericUDF { } @Nonnull - private static CharArraySet stopWords(@Nonnull final String[] array) + private static CharArraySet stopWords(@Nonnull final ObjectInspector oi) throws UDFArgumentException { + if (HiveUtils.isVoidOI(oi)) { + return JapaneseAnalyzer.getDefaultStopSet(); + } + final String[] array = HiveUtils.getConstStringArray(oi); if (array == null) { return 
JapaneseAnalyzer.getDefaultStopSet(); } @@ -152,6 +169,9 @@ public final class KuromojiUDF extends GenericUDF { @Nonnull private static Set<String> stopTags(@Nonnull final ObjectInspector oi) throws UDFArgumentException { + if (HiveUtils.isVoidOI(oi)) { + return JapaneseAnalyzer.getDefaultStopTags(); + } final String[] array = HiveUtils.getConstStringArray(oi); if (array == null) { return JapaneseAnalyzer.getDefaultStopTags(); @@ -170,6 +190,85 @@ public final class KuromojiUDF extends GenericUDF { return results; } + @Nullable + private static UserDictionary userDictionary(@Nonnull final ObjectInspector oi) + throws UDFArgumentException { + if (HiveUtils.isConstListOI(oi)) { + return userDictionary(HiveUtils.getConstStringArray(oi)); + } else if (HiveUtils.isConstString(oi)) { + return userDictionary(HiveUtils.getConstString(oi)); + } else { + throw new UDFArgumentException( + "User dictionary MUST be given as an array of constant string or constant string (URL)"); + } + } + + @Nullable + private static UserDictionary userDictionary(@Nullable final String[] userDictArray) + throws UDFArgumentException { + if (userDictArray == null) { + return null; + } + + final StringBuilder builder = new StringBuilder(); + for (String row : userDictArray) { + builder.append(row).append('\n'); + } + final Reader reader = new StringReader(builder.toString()); + try { + return UserDictionary.open(reader); // return null if empty + } catch (Throwable e) { + throw new UDFArgumentException( + "Failed to create user dictionary based on the given array<string>: " + e); + } + } + + @Nullable + private static UserDictionary userDictionary(@Nullable final String userDictURL) + throws UDFArgumentException { + if (userDictURL == null) { + return null; + } + + final HttpURLConnection conn; + try { + conn = HttpUtils.getHttpURLConnection(userDictURL); + } catch (IllegalArgumentException | IOException e) { + throw new UDFArgumentException("Failed to create HTTP connection to the URL: " + e); + 
} + + // allow to read as a compressed GZIP file for efficiency + conn.setRequestProperty("Accept-Encoding", "gzip"); + + conn.setConnectTimeout(CONNECT_TIMEOUT_MS); // throw exception from connect() + conn.setReadTimeout(READ_TIMEOUT_MS); // throw exception from getXXX() methods + + final int responseCode; + try { + responseCode = conn.getResponseCode(); + } catch (IOException e) { + throw new UDFArgumentException("Failed to get response code: " + e); + } + if (responseCode != 200) { + throw new UDFArgumentException("Got invalid response code: " + responseCode); + } + + final InputStream is; + try { + is = IOUtils.decodeInputStream(HttpUtils.getLimitedInputStream(conn, + MAX_INPUT_STREAM_SIZE)); + } catch (NullPointerException | IOException e) { + throw new UDFArgumentException("Failed to get input stream from the connection: " + e); + } + + final Reader reader = new InputStreamReader(is); + try { + return UserDictionary.open(reader); // return null if empty + } catch (Throwable e) { + throw new UDFArgumentException("Failed to parse the file in CSV format: " + e); + } + } + private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results) throws IOException { // instantiate an attribute placeholder once http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3cbc6647/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java ---------------------------------------------------------------------- diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java index 39d4821..afaa485 100644 --- a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java +++ b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java @@ -110,8 +110,7 @@ public final class SmartcnUDF extends GenericUDF { if (array.length == 0) { return CharArraySet.EMPTY_SET; } - CharArraySet results = new CharArraySet(Arrays.asList(array), /* ignoreCase */ - true); + CharArraySet results = new 
CharArraySet(Arrays.asList(array), true /* ignoreCase */); return results; } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3cbc6647/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java ---------------------------------------------------------------------- diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java index 7bbaed7..f9acc82 100644 --- a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java +++ b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java @@ -19,6 +19,7 @@ package hivemall.nlp.tokenizer; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; @@ -133,6 +134,54 @@ public class KuromojiUDFTest { } @Test + public void testFiveArgumentArray() throws UDFArgumentException, IOException { + GenericUDF udf = new KuromojiUDF(); + ObjectInspector[] argOIs = new ObjectInspector[5]; + // line + argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + // mode + PrimitiveTypeInfo stringType = new PrimitiveTypeInfo(); + stringType.setTypeName("string"); + argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector( + stringType, null); + // stopWords + argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, null); + // stopTags + argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, null); + // userDictUrl + argOIs[4] = ObjectInspectorFactory.getStandardConstantListObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, null); + udf.initialize(argOIs); + udf.close(); + } + + @Test + public void testFiveArgumenString() throws UDFArgumentException, IOException { + GenericUDF udf = new KuromojiUDF(); + ObjectInspector[] argOIs = new 
ObjectInspector[5]; + // line + argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + // mode + PrimitiveTypeInfo stringType = new PrimitiveTypeInfo(); + stringType.setTypeName("string"); + argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector( + stringType, null); + // stopWords + argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, null); + // stopTags + argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, null); + // userDictUrl + argOIs[4] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector( + stringType, null); + udf.initialize(argOIs); + udf.close(); + } + + @Test public void testEvaluateOneRow() throws IOException, HiveException { KuromojiUDF udf = new KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[1]; @@ -192,6 +241,130 @@ public class KuromojiUDFTest { } @Test + public void testEvaluateUserDictArray() throws IOException, HiveException { + KuromojiUDF udf = new KuromojiUDF(); + ObjectInspector[] argOIs = new ObjectInspector[5]; + // line + argOIs[0] = PrimitiveObjectInspectorFactory.writableStringObjectInspector; + // mode + PrimitiveTypeInfo stringType = new PrimitiveTypeInfo(); + stringType.setTypeName("string"); + argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector( + stringType, null); + // stopWords + argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector( + PrimitiveObjectInspectorFactory.writableStringObjectInspector, null); + // stopTags + argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector( + PrimitiveObjectInspectorFactory.writableStringObjectInspector, null); + // userDictArray (from 
https://raw.githubusercontent.com/atilika/kuromoji/909fd6b32bf4e9dc86b7599de5c9b50ca8f004a1/kuromoji-core/src/test/resources/userdict.txt) + List<String> userDict = new ArrayList<String>(); + userDict.add("æ¥æ¬çµæ¸æ°è,æ¥æ¬ çµæ¸ æ°è,ããã³ ã±ã¤ã¶ã¤ ã·ã³ãã³,ã«ã¹ã¿ã åè©"); + userDict.add("é¢è¥¿å½é空港,é¢è¥¿ å½é 空港,ã«ã³ãµã¤ ã³ã¯ãµã¤ ã¯ã¦ã³ã¦,ãã¹ãåè©"); + argOIs[4] = ObjectInspectorFactory.getStandardConstantListObjectInspector( + PrimitiveObjectInspectorFactory.writableStringObjectInspector, userDict); + udf.initialize(argOIs); + + DeferredObject[] args = new DeferredObject[1]; + args[0] = new DeferredObject() { + public Text get() throws HiveException { + return new Text("æ¥æ¬çµæ¸æ°èã"); + } + + @Override + public void prepare(int arg) throws HiveException {} + }; + + List<Text> tokens = udf.evaluate(args); + + Assert.assertNotNull(tokens); + Assert.assertEquals(3, tokens.size()); + + udf.close(); + } + + @Test(expected = UDFArgumentException.class) + public void testEvaluateInvalidUserDictURL() throws IOException, HiveException { + KuromojiUDF udf = new KuromojiUDF(); + ObjectInspector[] argOIs = new ObjectInspector[5]; + // line + argOIs[0] = PrimitiveObjectInspectorFactory.writableStringObjectInspector; + // mode + PrimitiveTypeInfo stringType = new PrimitiveTypeInfo(); + stringType.setTypeName("string"); + argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector( + stringType, null); + // stopWords + argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector( + PrimitiveObjectInspectorFactory.writableStringObjectInspector, null); + // stopTags + argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector( + PrimitiveObjectInspectorFactory.writableStringObjectInspector, null); + // userDictUrl + argOIs[4] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector( + stringType, new Text("http://google.com/")); + udf.initialize(argOIs); + + DeferredObject[] args = new DeferredObject[1]; + 
args[0] = new DeferredObject() { + public Text get() throws HiveException { + return new Text("ã¯ãã¢ã¸ã®JapaneseAnalyzerã使ã£ã¦ã¿ãããã¹ãã"); + } + + @Override + public void prepare(int arg) throws HiveException {} + }; + + List<Text> tokens = udf.evaluate(args); + Assert.assertNotNull(tokens); + + udf.close(); + } + + @Test + public void testEvaluateUserDictURL() throws IOException, HiveException { + KuromojiUDF udf = new KuromojiUDF(); + ObjectInspector[] argOIs = new ObjectInspector[5]; + // line + argOIs[0] = PrimitiveObjectInspectorFactory.writableStringObjectInspector; + // mode + PrimitiveTypeInfo stringType = new PrimitiveTypeInfo(); + stringType.setTypeName("string"); + argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector( + stringType, null); + // stopWords + argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector( + PrimitiveObjectInspectorFactory.writableStringObjectInspector, null); + // stopTags + argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector( + PrimitiveObjectInspectorFactory.writableStringObjectInspector, null); + // userDictUrl (Kuromoji official sample user defined dict on GitHub) + // e.g., "æ¥æ¬çµæ¸æ°è" will be "æ¥æ¬", "çµæ¸", and "æ°è" + argOIs[4] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector( + stringType, + new Text( + "https://raw.githubusercontent.com/atilika/kuromoji/909fd6b32bf4e9dc86b7599de5c9b50ca8f004a1/kuromoji-core/src/test/resources/userdict.txt")); + udf.initialize(argOIs); + + DeferredObject[] args = new DeferredObject[1]; + args[0] = new DeferredObject() { + public Text get() throws HiveException { + return new Text("ã¯ãã¢ã¸ã®JapaneseAnalyzerã使ã£ã¦ã¿ããæ¥æ¬çµæ¸æ°èã"); + } + + @Override + public void prepare(int arg) throws HiveException {} + }; + + List<Text> tokens = udf.evaluate(args); + + Assert.assertNotNull(tokens); + Assert.assertEquals(7, tokens.size()); + + udf.close(); + } + + @Test public void 
testSerializeByKryo() throws UDFArgumentException { final KuromojiUDF udf = new KuromojiUDF(); ObjectInspector[] argOIs = new ObjectInspector[1];
