This is an automated email from the ASF dual-hosted git repository.

myui pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git


The following commit(s) were added to refs/heads/master by this push:
     new b65949a  [HIVEMALL-316] Improve error message for duplicate entries 
error in Tokenizer user dictionary
b65949a is described below

commit b65949a188d3b60e1411a5de65b5ac2400513173
Author: Makoto Yui <[email protected]>
AuthorDate: Fri Jul 2 15:15:20 2021 +0900

    [HIVEMALL-316] Improve error message for duplicate entries error in 
Tokenizer user dictionary
    
    ## What changes were proposed in this pull request?
    
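    Improve the error message shown when a Tokenizer user dictionary cannot be
    parsed (for example, because it contains duplicate entries). The message now
    lists the conditions to check: UTF-8 file encoding, no duplicate entries, the
    32MB maximum dictionary size, and the connection/read timeouts.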
    
    ## What type of PR is it?
    
    Improvement
    
    ## What is the Jira issue?
    
    https://issues.apache.org/jira/browse/HIVEMALL-316
    
    ## Checklist
    
    (Please remove this section if not needed; check `x` for YES, blank for NO)
    
    - [x] Did you apply source code formatter, i.e., `./bin/format_code.sh`, 
for your commit?
    - [ ] Did you run system tests on Hive (or Spark)?
    
    Author: Makoto Yui <[email protected]>
    
    Closes #245 from myui/HIVEMALL-316.
---
 .../main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java   |  9 +++++++--
 nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java      |  9 +++++++--
 nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java    | 10 +++++++---
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java 
b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
index b41d4dc..39ea743 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
@@ -383,8 +383,13 @@ public final class KuromojiNEologdUDF extends 
UDFWithOptions {
             return UserDictionary.open(reader); // return null if empty
         } catch (Throwable e) {
             throw new UDFArgumentException(
-                "Failed to parse the file in CSV format (UTF-8 encoding is 
expected): "
-                        + userDictURL + '\n' + 
ExceptionUtils.prettyPrintStackTrace(e));
+                "Failed to parse the dictionary CSV file: " + userDictURL + 
'\n'
+                + "Please ensure that \n"
+                + "  1) file encoding is UTF-8, \n"
+                + "  2) no duplicate entry.\"\n"
+                + "  3) the maximum dictionary size is limited to 32MB (SHOULD 
be compressed using gzip with .gz suffix)\n"
+                + "  4) read timeout is set to 60 sec and connection must be 
established in 10 sec.\n"
+                        +  ExceptionUtils.prettyPrintStackTrace(e));
         }
     }
 
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java 
b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
index 07059b2..e84488e 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
@@ -383,8 +383,13 @@ public final class KuromojiUDF extends UDFWithOptions {
             return UserDictionary.open(reader); // return null if empty
         } catch (Throwable e) {
             throw new UDFArgumentException(
-                "Failed to parse the file in CSV format (UTF-8 encoding is 
expected): "
-                        + userDictURL + '\n' + 
ExceptionUtils.prettyPrintStackTrace(e));
+                "Failed to parse the dictionary CSV file: " + userDictURL + 
'\n'
+                + "Please ensure that \n"
+                + "  1) file encoding is UTF-8, \n"
+                + "  2) no duplicate entry.\"\n"
+                + "  3) the maximum dictionary size is limited to 32MB (SHOULD 
be compressed using gzip with .gz suffix)\n"
+                + "  4) read timeout is set to 60 sec and connection must be 
established in 10 sec.\n"
+                        +  ExceptionUtils.prettyPrintStackTrace(e));
         }
     }
 
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java 
b/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
index fb61633..1486b7f 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
@@ -267,7 +267,6 @@ public final class TokenizeKoUDF extends UDFWithOptions {
         return stopTags;
     }
 
-
     @Nullable
     private static UserDictionary userDictionary(@Nullable final String[] 
userDictArray)
             throws UDFArgumentException {
@@ -375,8 +374,13 @@ public final class TokenizeKoUDF extends UDFWithOptions {
             return UserDictionary.open(reader); // return null if empty
         } catch (Throwable e) {
             throw new UDFArgumentException(
-                "Failed to parse the file in CSV format (UTF-8 encoding is 
expected): "
-                        + userDictURL + '\n' + 
ExceptionUtils.prettyPrintStackTrace(e));
+                "Failed to parse the dictionary CSV file: " + userDictURL + 
'\n'
+                + "Please ensure that \n"
+                + "  1) file encoding is UTF-8, \n"
+                + "  2) no duplicate entry.\"\n"
+                + "  3) the maximum dictionary size is limited to 32MB (SHOULD 
be compressed using gzip with .gz suffix)\n"
+                + "  4) read timeout is set to 60 sec and connection must be 
established in 10 sec.\n"
+                        +  ExceptionUtils.prettyPrintStackTrace(e));
         }
     }
 

Reply via email to