Author: mattmann
Date: Wed Aug  5 01:33:13 2015
New Revision: 1694133

URL: http://svn.apache.org/r1694133
Log:
Fix for TIKA-1703: Can't Specify Tesseract Data Folder Distinct from Tesseract 
Executable Path. Contributed by Christian Wolfe <[email protected]>; this 
closes #56.

Modified:
    tika/trunk/CHANGES.txt
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
    
tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed Aug  5 01:33:13 2015
@@ -1,3 +1,9 @@
+Release 1.11 - Current Development
+
+  * The ability to specify the Tesseract Config Path was added
+    to the OCR Parser (TIKA-1703).
+
+
 Release 1.10 - 8/1/2015
 
   * Tika Config XML can now be used to create composite detectors,

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
 Wed Aug  5 01:33:13 2015
@@ -25,7 +25,7 @@ import java.util.Properties;
 
 /**
  * Configuration for TesseractOCRParser.
- * 
+ *
  * This allows to enable TesseractOCRParser and set its parameters:
  * <p>
  * TesseractOCRConfig config = new TesseractOCRConfig();<br>
@@ -36,27 +36,30 @@ import java.util.Properties;
  * Parameters can also be set by either editing the existing 
TesseractOCRConfig.properties file in,
  * tika-parser/src/main/resources/org/apache/tika/parser/ocr, or overriding it 
by creating your own
  * and placing it in the package org/apache/tika/parser/ocr on the classpath.
- * 
+ *
  */
 public class TesseractOCRConfig implements Serializable{
 
        private static final long serialVersionUID = -4861942486845757891L;
-       
+
        // Path to tesseract installation folder, if not on system path.
        private  String tesseractPath = "";
-       
+
+    // Path to the 'tessdata' folder, which contains language files and config 
files.
+    private String tessdataPath = "";
+
        // Language dictionary to be used.
        private  String language = "eng";
-       
+
        // Tesseract page segmentation mode.
        private  String pageSegMode = "1";
-       
+
        // Minimum file size to submit file to ocr.
        private  int minFileSizeToOcr = 0;
-       
+
        // Maximum file size to submit file to ocr.
        private  int maxFileSizeToOcr = Integer.MAX_VALUE;
-       
+
        // Maximum time (seconds) to wait for the ocring process termination
        private int timeout = 120;
 
@@ -98,6 +101,8 @@ public class TesseractOCRConfig implemen
 
                setTesseractPath(
                                getProp(props, "tesseractPath", 
getTesseractPath()));
+        setTessdataPath(
+                getProp(props, "tessdataPath", getTessdataPath()));
                setLanguage(
                                getProp(props, "language", getLanguage()));
                setPageSegMode(
@@ -107,7 +112,7 @@ public class TesseractOCRConfig implemen
                setMaxFileSizeToOcr(
                                getProp(props, "maxFileSizeToOcr", 
getMaxFileSizeToOcr()));
                setTimeout(
-                               getProp(props, "timeout", getTimeout()));
+                getProp(props, "timeout", getTimeout()));
 
        }
 
@@ -115,22 +120,43 @@ public class TesseractOCRConfig implemen
        public String getTesseractPath() {
                return tesseractPath;
        }
-       
+
        /**
-        * Set tesseract installation folder, needed if it is not on system 
path.
+        * Set the path to the Tesseract executable, needed if it is not on 
system path.
+     * <p>
+     * Note that if you set this value, it is highly recommended that you also
+     * set the path to the 'tessdata' folder using {@link #setTessdataPath}.
+     * </p>
         */
        public void setTesseractPath(String tesseractPath) {
                if(!tesseractPath.isEmpty() && 
!tesseractPath.endsWith(File.separator))
                        tesseractPath += File.separator;
-               
+
                this.tesseractPath = tesseractPath;
        }
-       
+
+    /** @see #setTessdataPath(String tessdataPath) */
+    public String getTessdataPath() {
+        return tessdataPath;
+    }
+
+    /**
+     * Set the path to the 'tessdata' folder, which contains language files 
and config files. In some cases (such
+     * as on Windows), this folder is found in the Tesseract installation, but 
in other cases
+     * (such as when Tesseract is built from source), it may be located 
elsewhere.
+     */
+    public void setTessdataPath(String tessdataPath) {
+        if(!tessdataPath.isEmpty() && !tessdataPath.endsWith(File.separator))
+            tessdataPath += File.separator;
+
+        this.tessdataPath = tessdataPath;
+    }
+
        /** @see #setLanguage(String language)*/
        public String getLanguage() {
                return language;
        }
-       
+
        /**
         * Set tesseract language dictionary to be used. Default is "eng".
         * Multiple languages may be specified, separated by plus characters.
@@ -141,12 +167,12 @@ public class TesseractOCRConfig implemen
                }
                this.language = language;
        }
-       
+
        /** @see #setPageSegMode(String pageSegMode)*/
        public String getPageSegMode() {
                return pageSegMode;
        }
-       
+
        /**
         * Set tesseract page segmentation mode.
         * Default is 1 = Automatic page segmentation with OSD (Orientation and 
Script Detection)
@@ -157,12 +183,12 @@ public class TesseractOCRConfig implemen
                }
                this.pageSegMode = pageSegMode;
        }
-       
+
        /** @see #setMinFileSizeToOcr(int minFileSizeToOcr)*/
        public int getMinFileSizeToOcr() {
                return minFileSizeToOcr;
        }
-       
+
        /**
         * Set minimum file size to submit file to ocr.
         * Default is 0.
@@ -170,12 +196,12 @@ public class TesseractOCRConfig implemen
        public void setMinFileSizeToOcr(int minFileSizeToOcr) {
                this.minFileSizeToOcr = minFileSizeToOcr;
        }
-       
+
        /** @see #setMaxFileSizeToOcr(int maxFileSizeToOcr)*/
        public int getMaxFileSizeToOcr() {
                return maxFileSizeToOcr;
        }
-       
+
        /**
         * Set maximum file size to submit file to ocr.
         * Default is Integer.MAX_VALUE.

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
 Wed Aug  5 01:33:13 2015
@@ -97,9 +97,14 @@ public class TesseractOCRParser extends
     }
 
     private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
-        if (!config.getTesseractPath().isEmpty()) {
-            Map<String, String> env = pb.environment();
-            env.put("TESSDATA_PREFIX", config.getTesseractPath());
+        String tessdataPrefix = "TESSDATA_PREFIX";
+        Map<String, String> env = pb.environment();
+
+        if (!config.getTessdataPath().isEmpty()) {
+            env.put(tessdataPrefix, config.getTessdataPath());
+        }
+        else if(!config.getTesseractPath().isEmpty()) {
+            env.put(tessdataPrefix, config.getTesseractPath());
         }
     }
 

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
 Wed Aug  5 01:33:13 2015
@@ -32,6 +32,7 @@ public class TesseractOCRConfigTest exte
     public void testNoConfig() throws Exception {
         TesseractOCRConfig config = new TesseractOCRConfig();
         assertEquals("Invalid default tesseractPath value", "", 
config.getTesseractPath());
+        assertEquals("Invalid default tessdataPath value", "", 
config.getTessdataPath());
         assertEquals("Invalid default language value", "eng", 
config.getLanguage());
         assertEquals("Invalid default pageSegMode value", "1", 
config.getPageSegMode());
         assertEquals("Invalid default minFileSizeToOcr value", 0, 
config.getMinFileSizeToOcr());
@@ -47,6 +48,7 @@ public class TesseractOCRConfigTest exte
 
         TesseractOCRConfig config = new TesseractOCRConfig(stream);
         assertEquals("Invalid default tesseractPath value", "", 
config.getTesseractPath());
+        assertEquals("Invalid default tessdataPath value", "", 
config.getTessdataPath());
         assertEquals("Invalid overridden language value", "fra+deu", 
config.getLanguage());
         assertEquals("Invalid default pageSegMode value", "1", 
config.getPageSegMode());
         assertEquals("Invalid overridden minFileSizeToOcr value", 1, 
config.getMinFileSizeToOcr());
@@ -62,6 +64,7 @@ public class TesseractOCRConfigTest exte
 
         TesseractOCRConfig config = new TesseractOCRConfig(stream);
         assertEquals("Invalid overridden tesseractPath value", 
"/opt/tesseract" + File.separator, config.getTesseractPath());
+        assertEquals("Invalid overridden tesseractPath value", 
"/usr/local/share" + File.separator, config.getTessdataPath());
         assertEquals("Invalid overridden language value", "fra+deu", 
config.getLanguage());
         assertEquals("Invalid overridden pageSegMode value", "2", 
config.getPageSegMode());
         assertEquals("Invalid overridden minFileSizeToOcr value", 1, 
config.getMinFileSizeToOcr());

Modified: 
tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties
 (original)
+++ 
tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties
 Wed Aug  5 01:33:13 2015
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 tesseractPath=/opt/tesseract
+tessdataPath=/usr/local/share
 language=fra+deu
 pageSegMode=2
 maxFileSizeToOcr=2000000


Reply via email to