This is an automated email from the ASF dual-hosted git repository.
myui pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git
The following commit(s) were added to refs/heads/master by this push:
new b65949a [HIVEMALL-316] Improve error message for duplicate entries
error in Tokenizer user dictionary
b65949a is described below
commit b65949a188d3b60e1411a5de65b5ac2400513173
Author: Makoto Yui <[email protected]>
AuthorDate: Fri Jul 2 15:15:20 2021 +0900
[HIVEMALL-316] Improve error message for duplicate entries error in
Tokenizer user dictionary
## What changes were proposed in this pull request?
Improve the error message reported for duplicate-entry errors in the Tokenizer
user dictionary
## What type of PR is it?
Improvement
## What is the Jira issue?
https://issues.apache.org/jira/browse/HIVEMALL-316
## Checklist
(Please remove this section if not needed; check `x` for YES, blank for NO)
- [x] Did you apply source code formatter, i.e., `./bin/format_code.sh`,
for your commit?
- [ ] Did you run system tests on Hive (or Spark)?
Author: Makoto Yui <[email protected]>
Closes #245 from myui/HIVEMALL-316.
---
.../main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java | 9 +++++++--
nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java | 9 +++++++--
nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java | 10 +++++++---
3 files changed, 21 insertions(+), 7 deletions(-)
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
index b41d4dc..39ea743 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
@@ -383,8 +383,13 @@ public final class KuromojiNEologdUDF extends
UDFWithOptions {
return UserDictionary.open(reader); // return null if empty
} catch (Throwable e) {
throw new UDFArgumentException(
- "Failed to parse the file in CSV format (UTF-8 encoding is
expected): "
- + userDictURL + '\n' +
ExceptionUtils.prettyPrintStackTrace(e));
+ "Failed to parse the dictionary CSV file: " + userDictURL +
'\n'
+ + "Please ensure that \n"
+ + " 1) file encoding is UTF-8, \n"
+ + " 2) no duplicate entry.\"\n"
+ + " 3) the maximum dictionary size is limited to 32MB (SHOULD
be compressed using gzip with .gz suffix)\n"
+ + " 4) read timeout is set to 60 sec and connection must be
established in 10 sec.\n"
+ + ExceptionUtils.prettyPrintStackTrace(e));
}
}
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
index 07059b2..e84488e 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
@@ -383,8 +383,13 @@ public final class KuromojiUDF extends UDFWithOptions {
return UserDictionary.open(reader); // return null if empty
} catch (Throwable e) {
throw new UDFArgumentException(
- "Failed to parse the file in CSV format (UTF-8 encoding is
expected): "
- + userDictURL + '\n' +
ExceptionUtils.prettyPrintStackTrace(e));
+ "Failed to parse the dictionary CSV file: " + userDictURL +
'\n'
+ + "Please ensure that \n"
+ + " 1) file encoding is UTF-8, \n"
+ + " 2) no duplicate entry.\"\n"
+ + " 3) the maximum dictionary size is limited to 32MB (SHOULD
be compressed using gzip with .gz suffix)\n"
+ + " 4) read timeout is set to 60 sec and connection must be
established in 10 sec.\n"
+ + ExceptionUtils.prettyPrintStackTrace(e));
}
}
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
b/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
index fb61633..1486b7f 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
@@ -267,7 +267,6 @@ public final class TokenizeKoUDF extends UDFWithOptions {
return stopTags;
}
-
@Nullable
private static UserDictionary userDictionary(@Nullable final String[]
userDictArray)
throws UDFArgumentException {
@@ -375,8 +374,13 @@ public final class TokenizeKoUDF extends UDFWithOptions {
return UserDictionary.open(reader); // return null if empty
} catch (Throwable e) {
throw new UDFArgumentException(
- "Failed to parse the file in CSV format (UTF-8 encoding is
expected): "
- + userDictURL + '\n' +
ExceptionUtils.prettyPrintStackTrace(e));
+ "Failed to parse the dictionary CSV file: " + userDictURL +
'\n'
+ + "Please ensure that \n"
+ + " 1) file encoding is UTF-8, \n"
+ + " 2) no duplicate entry.\"\n"
+ + " 3) the maximum dictionary size is limited to 32MB (SHOULD
be compressed using gzip with .gz suffix)\n"
+ + " 4) read timeout is set to 60 sec and connection must be
established in 10 sec.\n"
+ + ExceptionUtils.prettyPrintStackTrace(e));
}
}