Thanks, Dave. I think you forgot to commit the default config file (TesseractOCRConfig.properties)? ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Chris Mattmann, Ph.D. Chief Architect Instrument Software and Science Data Systems Section (398) NASA Jet Propulsion Laboratory Pasadena, CA 91109 USA Office: 168-519, Mailstop: 168-527 Email: [email protected] WWW: http://sunset.usc.edu/~mattmann/ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Adjunct Associate Professor, Computer Science Department University of Southern California, Los Angeles, CA 90089 USA ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-----Original Message----- From: "[email protected]" <[email protected]> Reply-To: "[email protected]" <[email protected]> Date: Sunday, November 16, 2014 at 6:37 PM To: "[email protected]" <[email protected]> Subject: svn commit: r1640017 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tesseract OCRConfig.java >Author: dmeikle >Date: Sun Nov 16 17:37:30 2014 >New Revision: 1640017 > >URL: http://svn.apache.org/r1640017 >Log: >TIKA-1476 - Updated TesseractOCRConfig to read from property file if >present on classpath > >Modified: > >tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tesseract >OCRConfig.java > >Modified: >tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tesseract >OCRConfig.java >URL: >http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apa >che/tika/parser/ocr/TesseractOCRConfig.java?rev=1640017&r1=1640016&r2=1640 >017&view=diff >========================================================================== >==== >--- >tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tesseract >OCRConfig.java (original) >+++ >tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tesseract >OCRConfig.java Sun Nov 16 17:37:30 2014 >@@ -17,7 +17,10 @@ > package org.apache.tika.parser.ocr; > > import java.io.File; >+import java.io.IOException; >+import java.io.InputStream; > import java.io.Serializable; >+import java.util.Properties; > > /** > * Configuration for TesseractOCRParser. >@@ -28,7 +31,11 @@ import java.io.Serializable; > * config.setTesseractPath(tesseractFolder);<br> > * parseContext.set(TesseractOCRConfig.class, config);<br> > * </p> >- * >+ * >+ * Parameters can also be set by creating the >TesseractOCRConfig.properties file >+ * and placing it in the package org/apache/tika/parser/ocr on the >classpath. 
An >+ * example file can be found in the test resources folder: >+ * ><code>tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-f >ull.properties</code>. > * > */ > public class TesseractOCRConfig implements Serializable{ >@@ -52,7 +59,58 @@ public class TesseractOCRConfig implemen > > // Maximum time (seconds) to wait for the ocring process termination > private int timeout = 120; >- >+ >+ /** >+ * Default contructor. >+ */ >+ public TesseractOCRConfig() { >+ >init(this.getClass().getResourceAsStream("TesseractOCRConfig.properties >")); >+ } >+ >+ /** >+ * Loads properties from InputStream and then tries to close >InputStream. >+ * If there is an IOException, this silently swallows the exception >+ * and goes back to the default. >+ * >+ * @param is >+ */ >+ public TesseractOCRConfig(InputStream is) { >+ init(is); >+ } >+ >+ private void init(InputStream is) { >+ if (is == null) { >+ return; >+ } >+ Properties props = new Properties(); >+ try { >+ props.load(is); >+ } catch (IOException e) { >+ } finally { >+ if (is != null) { >+ try { >+ is.close(); >+ } catch (IOException e) { >+ //swallow >+ } >+ } >+ } >+ >+ setTesseractPath( >+ getProp(props, "tesseractPath", >getTesseractPath())); >+ setLanguage( >+ getProp(props, "language", getLanguage())); >+ setPageSegMode( >+ getProp(props, "pageSegMode", >getPageSegMode())); >+ setMinFileSizeToOcr( >+ getProp(props, "minFileSizeToOcr", >getMinFileSizeToOcr())); >+ setMaxFileSizeToOcr( >+ getProp(props, "maxFileSizeToOcr", >getMaxFileSizeToOcr())); >+ setTimeout( >+ getProp(props, "timeout", getTimeout())); >+ >+ } >+ > /** @see #setTesseractPath(String tesseractPath)*/ > public String getTesseractPath() { > return tesseractPath; >@@ -62,7 +120,7 @@ public class TesseractOCRConfig implemen > * Set tesseract installation folder, needed if it is not on system >path. 
> */ > public void setTesseractPath(String tesseractPath) { >- if(!tesseractPath.endsWith(File.separator)) >+ if(!tesseractPath.isEmpty() && >!tesseractPath.endsWith(File.separator)) > tesseractPath += File.separator; > > this.tesseractPath = tesseractPath; >@@ -132,5 +190,34 @@ public class TesseractOCRConfig implemen > public int getTimeout() { > return timeout; > } >- >+ >+ /** >+ * Get property from the properties file passed in. >+ * @param properties properties file to read from. >+ * @param property the property to fetch. >+ * @param defaultMissing default parameter to use. >+ * @return the value. >+ */ >+ private int getProp(Properties properties, String property, int >defaultMissing) { >+ String p = properties.getProperty(property); >+ if (p == null || p.isEmpty()){ >+ return defaultMissing; >+ } >+ try { >+ return Integer.parseInt(p); >+ } catch (Throwable ex) { >+ throw new RuntimeException(String.format("Cannot parse >TesseractOCRConfig variable $s, invalid integer value", property), ex); >+ } >+ } >+ >+ /** >+ * Get property from the properties file passed in. >+ * @param properties properties file to read from. >+ * @param property the property to fetch. >+ * @param defaultMissing default parameter to use. >+ * @return the value. >+ */ >+ private String getProp(Properties properties, String property, String >defaultMissing) { >+ return properties.getProperty(property, defaultMissing); >+ } > } > >
