tballison commented on a change in pull request #402:
URL: https://github.com/apache/tika/pull/402#discussion_r570380074
##########
File path:
tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
##########
@@ -249,17 +255,71 @@ public String getLanguage() {
/**
* Set tesseract language dictionary to be used. Default is "eng".
+ * languages are either:
+ * <ol>
+ * <li>Nominally an ISO-639-2 code but compound codes are allowed
separated by underscore: e.g., chi_tra_vert, aze_cyrl</li>
+ * <li>A file path in the script directory. The name starts with
upper-case letter.
+ * Some of them have underscores and other upper-case letters: e.g.,
script/Arabic, script/HanS_vert, script/Japanese_vert,
script/Canadian_Aboriginal</li>
+ * </ol>
* Multiple languages may be specified, separated by plus characters.
- * e.g. "chi_tra+chi_sim"
+ * e.g. "chi_tra+chi_sim+script/Arabic"
*/
public void setLanguage(String language) {
- if (!language.matches("([a-zA-Z]{3}(_[a-zA-Z]{3,4}){0,2}(\\+?))+")
- || language.endsWith("+")) {
- throw new IllegalArgumentException("Invalid language code:
"+language);
+ // Get rid of embedded spaces
+ language = language.replaceAll("\\s", "");
+ // Test for leading or trailing +
+ if (language.matches("\\+.*|.*\\+")) {
+ throw new IllegalArgumentException("Invalid syntax - Can't start
or end with +" + language);
+ }
+ // Split on the + sign
+ final String[] langs = language.split("\\+");
+ List<String> invalidCodes = new ArrayList<>();
+ for (String lang : langs) {
+ // First, make sure it conforms to the correct syntax
+ if
(!lang.matches("([a-zA-Z]{3}(_[a-zA-Z]{3,4}){0,2})|script(/|\\\\)[A-Z][a-zA-Z_]+"))
{
+ invalidCodes.add(lang + " (invalid syntax)");
+ } else if (!langExists(lang)) {
+ invalidCodes.add(lang + " (not found)");
+ }
+ }
+ if (!invalidCodes.isEmpty()) {
+ throw new IllegalArgumentException("Invalid language code(s): " +
invalidCodes);
}
this.language = language;
}
+
+ /**
+ * Check if tessdata language model exists
+ */
+ private boolean langExists(String lang) {
Review comment:
Yes.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org