Repository: incubator-hivemall
Updated Branches:
  refs/heads/v0.5.0 464d08912 -> 3aeed6f75
Explicitly use UTF-8 for Kuromoji dict encoding

Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/3aeed6f7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/3aeed6f7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/3aeed6f7

Branch: refs/heads/v0.5.0
Commit: 3aeed6f7519da6b336dda82b79437a5a964ba4cd
Parents: 464d089
Author: Makoto Yui <[email protected]>
Authored: Fri Jan 19 01:34:05 2018 +0900
Committer: Makoto Yui <[email protected]>
Committed: Fri Jan 19 01:35:45 2018 +0900

----------------------------------------------------------------------
 .../hivemall/nlp/tokenizer/KuromojiUDF.java | 24 +++++++++++++-------
 1 file changed, 16 insertions(+), 8 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3aeed6f7/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
----------------------------------------------------------------------
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
index f56568b..411c89e 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
@@ -18,12 +18,20 @@
  */
 package hivemall.nlp.tokenizer;
 
+import hivemall.utils.hadoop.HiveUtils;
+import hivemall.utils.io.HttpUtils;
+import hivemall.utils.io.IOUtils;
+import hivemall.utils.lang.ExceptionUtils;
+
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
 import java.net.HttpURLConnection;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -51,11 +59,6 @@ import org.apache.lucene.analysis.ja.dict.UserDictionary;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
 
-import hivemall.utils.hadoop.HiveUtils;
-import hivemall.utils.io.HttpUtils;
-import hivemall.utils.io.IOUtils;
-import hivemall.utils.lang.ExceptionUtils;
-
 @Description(name = "tokenize_ja",
         value = "_FUNC_(String line [, const string mode = \"normal\", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or string userDictURL)])"
                 + " - returns tokenized strings in array<string>")
@@ -266,12 +269,17 @@ public final class KuromojiUDF extends GenericUDF {
                     + userDictURL + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
         }
 
-        final Reader reader = new InputStreamReader(is);
+        CharsetDecoder decoder =
+                StandardCharsets.UTF_8.newDecoder()
+                                      .onMalformedInput(CodingErrorAction.REPORT)
+                                      .onUnmappableCharacter(CodingErrorAction.REPORT);
+        final Reader reader = new InputStreamReader(is, decoder);
         try {
             return UserDictionary.open(reader); // return null if empty
         } catch (Throwable e) {
-            throw new UDFArgumentException("Failed to parse the file in CSV format: " + userDictURL
-                    + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
+            throw new UDFArgumentException(
+                "Failed to parse the file in CSV format (UTF-8 encoding is expected): "
+                        + userDictURL + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
         }
     }
