Repository: incubator-hivemall
Updated Branches:
  refs/heads/v0.5.0 464d08912 -> 3aeed6f75


Explicitly use UTF-8 for Kuromoji dict encoding


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/3aeed6f7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/3aeed6f7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/3aeed6f7

Branch: refs/heads/v0.5.0
Commit: 3aeed6f7519da6b336dda82b79437a5a964ba4cd
Parents: 464d089
Author: Makoto Yui <[email protected]>
Authored: Fri Jan 19 01:34:05 2018 +0900
Committer: Makoto Yui <[email protected]>
Committed: Fri Jan 19 01:35:45 2018 +0900

----------------------------------------------------------------------
 .../hivemall/nlp/tokenizer/KuromojiUDF.java     | 24 +++++++++++++-------
 1 file changed, 16 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3aeed6f7/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
----------------------------------------------------------------------
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
index f56568b..411c89e 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
@@ -18,12 +18,20 @@
  */
 package hivemall.nlp.tokenizer;
 
+import hivemall.utils.hadoop.HiveUtils;
+import hivemall.utils.io.HttpUtils;
+import hivemall.utils.io.IOUtils;
+import hivemall.utils.lang.ExceptionUtils;
+
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
 import java.net.HttpURLConnection;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -51,11 +59,6 @@ import org.apache.lucene.analysis.ja.dict.UserDictionary;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
 
-import hivemall.utils.hadoop.HiveUtils;
-import hivemall.utils.io.HttpUtils;
-import hivemall.utils.io.IOUtils;
-import hivemall.utils.lang.ExceptionUtils;
-
 @Description(name = "tokenize_ja",
         value = "_FUNC_(String line [, const string mode = \"normal\", const 
array<string> stopWords, const array<string> stopTags, const array<string> 
userDict (or string userDictURL)])"
                 + " - returns tokenized strings in array<string>")
@@ -266,12 +269,17 @@ public final class KuromojiUDF extends GenericUDF {
                     + userDictURL + '\n' + 
ExceptionUtils.prettyPrintStackTrace(e));
         }
 
-        final Reader reader = new InputStreamReader(is);
+        CharsetDecoder decoder =
+                StandardCharsets.UTF_8.newDecoder()
+                                      
.onMalformedInput(CodingErrorAction.REPORT)
+                                      
.onUnmappableCharacter(CodingErrorAction.REPORT);
+        final Reader reader = new InputStreamReader(is, decoder);
         try {
             return UserDictionary.open(reader); // return null if empty
         } catch (Throwable e) {
-            throw new UDFArgumentException("Failed to parse the file in CSV 
format: " + userDictURL
-                    + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
+            throw new UDFArgumentException(
+                "Failed to parse the file in CSV format (UTF-8 encoding is 
expected): "
+                        + userDictURL + '\n' + 
ExceptionUtils.prettyPrintStackTrace(e));
         }
     }
 

Reply via email to