Repository: tika
Updated Branches:
  refs/heads/master 95b2cd127 -> 6f16480f7


fix for TIKA-2021 contributed by Zarana Parekh


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/48b27d21
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/48b27d21
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/48b27d21

Branch: refs/heads/master
Commit: 48b27d219f791ee14f1e0ffa18e4e80583f3df54
Parents: 2031de7
Author: Zarana Parekh <[email protected]>
Authored: Fri Jun 24 18:53:00 2016 -0700
Committer: Zarana Parekh <[email protected]>
Committed: Fri Jun 24 18:53:00 2016 -0700

----------------------------------------------------------------------
 tika-bundle/pom.xml                             |   2 +
 tika-parsers/pom.xml                            |  11 +-
 .../tika/parser/ocr/TesseractOCRConfig.java     | 158 ++++++++++++++++++-
 .../tika/parser/ocr/TesseractOCRParser.java     | 110 ++++++++++++-
 .../parser/ocr/TesseractOCRConfig.properties    |  11 +-
 .../org/apache/tika/parser/ocr/rotation.py      |  72 +++++++++
 6 files changed, 359 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/48b27d21/tika-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index ca1d6f2..7fb5c8d 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -112,6 +112,7 @@
   </dependencies>
 
   <build>
+       <pluginManagement>
     <plugins>
       <plugin>
         <groupId>org.apache.felix</groupId>
@@ -426,6 +427,7 @@
         </configuration>
       </plugin>
     </plugins>
+    </pluginManagement>
   </build>
 
   <organization>

http://git-wip-us.apache.org/repos/asf/tika/blob/48b27d21/tika-parsers/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index a126eed..cab385e 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -49,6 +49,7 @@
     <sis.version>0.6</sis.version>
     <!-- used by POI, PDFBox and Jackcess ...try to sync -->
     <bouncycastle.version>1.54</bouncycastle.version>
+    <commonsexec.version>1.3</commonsexec.version>
   </properties>
 
   <dependencies>
@@ -256,7 +257,13 @@
       <artifactId>cxf-rt-rs-client</artifactId>
       <version>${cxf.version}</version>
     </dependency>
-       
+    <!-- TIKA-2021: Tesseract OCR Parser dependencies, 
+    used for executing image processing script -->
+       <dependency>
+           <groupId>org.apache.commons</groupId>
+           <artifactId>commons-exec</artifactId>
+           <version>${commonsexec.version}</version>
+       </dependency>
 
     <!-- Provided dependencies -->
     <dependency>
@@ -502,6 +509,7 @@
         </file>
       </activation>
       <build>
+      <pluginManagement>
         <plugins>
           <plugin>
             <groupId>org.codehaus.gmaven</groupId>
@@ -532,6 +540,7 @@
             </executions>
           </plugin>
         </plugins>
+        </pluginManagement>
       </build>
     </profile>
   </profiles>

http://git-wip-us.apache.org/repos/asf/tika/blob/48b27d21/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index a35370a..d660142 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -63,6 +63,24 @@ public class TesseractOCRConfig implements Serializable{
        // Maximum time (seconds) to wait for the ocring process termination
        private int timeout = 120;
 
+       // Path to ImageMagick program, if not on system path.
+       private String ImageMagickPath = "";
+       
+       // resolution of processed image (in dpi).
+       private int density = 300;
+       
+       // number of bits in a color sample within a pixel.
+       private int depth = 4;
+       
+       // colorspace of processed image.
+       private String colorspace = "gray";
+       
+       // filter to be applied to the processed image.
+       private String filter = "triangle";
+       
+       // factor by which image is to be scaled.
+       private int resize = 900;
+       
        /**
         * Default contructor.
         */
@@ -99,6 +117,7 @@ public class TesseractOCRConfig implements Serializable{
                        }
                }
 
+               // set parameters for Tesseract
                setTesseractPath(
                                getProp(props, "tesseractPath", 
getTesseractPath()));
         setTessdataPath(
@@ -113,9 +132,23 @@ public class TesseractOCRConfig implements Serializable{
                                getProp(props, "maxFileSizeToOcr", 
getMaxFileSizeToOcr()));
                setTimeout(
                 getProp(props, "timeout", getTimeout()));
+               
+               // set parameters for ImageMagick
+               setImageMagickPath(
+                               getProp(props, "ImageMagickPath", 
getImageMagickPath()));
+               setDensity(
+                               getProp(props, "density", getDensity()));
+               setDepth(
+                               getProp(props, "depth", getDepth()));
+               setColorspace(
+                               getProp(props, "colorspace", getColorspace()));
+               setFilter(
+                               getProp(props, "filter", getFilter()));
+               setResize(
+                               getProp(props, "resize", getResize()));
 
        }
-
+       
        /** @see #setTesseractPath(String tesseractPath)*/
        public String getTesseractPath() {
                return tesseractPath;
@@ -222,8 +255,130 @@ public class TesseractOCRConfig implements Serializable{
        public int getTimeout() {
                return timeout;
        }
+       
+       /**
+        * @return the density
+        */
+       public int getDensity() {
+               return density;
+       }
+
+       /**
+        * @param density the density to set
+        * Default value is 300.
+        */
+       public void setDensity(int density) {
+               if(density < 150 || density > 1200) {
+                       throw new IllegalArgumentException("Invalid density 
value");
+               }
+               this.density = density;
+       }
 
        /**
+        * @return the depth
+        */
+       public int getDepth() {
+               return depth;
+       }
+
+       /**
+        * @param depth the depth to set
+        * Default value is 4.
+        */
+       public void setDepth(int depth) {
+               // Only the values listed here are accepted; anything else is rejected.
+               final int[] permitted = {2, 4, 8, 16, 32, 64, 256, 4096};
+               boolean recognised = false;
+               for (int candidate : permitted) {
+                       if (candidate == depth) {
+                               recognised = true;
+                               break;
+                       }
+               }
+               if (!recognised) {
+                       throw new IllegalArgumentException("Invalid depth value");
+               }
+               this.depth = depth;
+       }
+
+       /**
+        * @return the colorspace
+        */
+       public String getColorspace() {
+               return colorspace;
+       }
+
+       /**
+        * @param colorspace the colorspace to set
+        * Default value is gray.
+        */
+       public void setColorspace(String colorspace) {
+               // Compare against null directly: calling equals(null) on a null reference
+               // throws a NullPointerException, and on a non-null reference it is always
+               // false, so it can never detect the missing value.
+               if (colorspace == null) {
+                       throw new IllegalArgumentException("Invalid colorspace value");
+               }
+               this.colorspace = colorspace;
+       }
+
+       /**
+        * @return the filter
+        */
+       public String getFilter() {
+               return filter;
+       }
+
+       /**
+        * @param filter the filter to set
+        * Default value is triangle.
+        */
+       public void setFilter(String filter) {
+               // Compare against null directly: filter.equals(null) would throw a
+               // NullPointerException for a null argument instead of rejecting it cleanly.
+               if (filter == null) {
+                       throw new IllegalArgumentException("Invalid filter value");
+               }
+
+               // Names are matched case-insensitively; the value is stored exactly as given.
+               String[] allowedFilters = {"Point", "Hermite", "Cubic", "Box", "Gaussian",
+                               "Catrom", "Triangle", "Quadratic", "Mitchell"};
+               for (String allowed : allowedFilters) {
+                       if (allowed.equalsIgnoreCase(filter)) {
+                               this.filter = filter;
+                               return;
+                       }
+               }
+               throw new IllegalArgumentException("Invalid filter value");
+       }
+
+       /**
+        * @return the resize
+        */
+       public int getResize() {
+               return resize;
+       }
+
+       /**
+        * @param resize the resize to set
+        * Default value is 900.
+        */
+       public void setResize(int resize) {
+               for(int i=1;i<10;i++) {
+                       if(resize == i*100) {
+                               this.resize = resize;
+                               return;
+                       }
+               }
+               throw new IllegalArgumentException("Invalid resize value");
+       }
+
+       /** @see #setImageMagickPath(String ImageMagickPath)*/
+       public String getImageMagickPath() {
+               
+               return ImageMagickPath;
+       }
+       
+       /**
+        * Set the path to the ImageMagick executable, needed if it is not on the system path.
+        */
+       public void setImageMagickPath(String ImageMagickPath) {
+               if(!ImageMagickPath.isEmpty() && 
!ImageMagickPath.endsWith(File.separator))
+                       ImageMagickPath += File.separator;
+               
+               this.ImageMagickPath = ImageMagickPath;
+       }
+       
+       /**
         * Get property from the properties file passed in.
         * @param properties properties file to read from.
         * @param property the property to fetch.
@@ -253,4 +408,5 @@ public class TesseractOCRConfig implements Serializable{
        private String getProp(Properties properties, String property, String 
defaultMissing) {
                return properties.getProperty(property, defaultMissing);
        }
+       
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/48b27d21/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index a238a7c..1280aec 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -20,6 +20,8 @@ import javax.imageio.ImageIO;
 
 import java.awt.Image;
 import java.awt.image.BufferedImage;
+import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
@@ -27,6 +29,9 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
@@ -40,6 +45,10 @@ import java.util.concurrent.FutureTask;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 
+import org.apache.commons.exec.CommandLine;
+import org.apache.commons.exec.DefaultExecutor;
+import org.apache.commons.exec.PumpStreamHandler;
+import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.logging.LogFactory;
 import org.apache.tika.exception.TikaException;
@@ -127,7 +136,43 @@ public class TesseractOCRParser extends AbstractParser {
         return hasTesseract;
      
     }
+    
+    public boolean hasImageMagick(TesseractOCRConfig config) {
+        // Command to probe: the configured directory prefix followed by the program name
+        final String command = config.getImageMagickPath() + getImageMagickProg();
+
+        // Probe results are kept in the TESSERACT_PRESENT map, keyed by the command string
+        if (!TESSERACT_PRESENT.containsKey(command)) {
+            // Not checked before: try running the program and remember whether that worked
+            boolean available = ExternalParser.check(new String[] { command });
+            TESSERACT_PRESENT.put(command, available);
+            return available;
+        }
+        return TESSERACT_PRESENT.get(command);
+    }
+    
+    public boolean hasPython() {
+        // Python counts as available when "python -h" can be started and writes at
+        // least one byte to its standard output. Nothing else is verified here.
+        Process proc = null;
+        try {
+            // Argument array instead of a single command string, so no tokenizing is involved
+            proc = Runtime.getRuntime().exec(new String[] { "python", "-h" });
+            // Close the stream once the first byte (or end of stream) has been seen
+            try (InputStream stdout = proc.getInputStream()) {
+                return stdout.read() != -1;
+            }
+        } catch (IOException e) {
+            // The interpreter could not be started or read from. A missing python is an
+            // expected situation, so it is logged quietly rather than dumped to stderr.
+            LogFactory.getLog(TesseractOCRParser.class).debug("Python is not available", e);
+            return false;
+        } finally {
+            if (proc != null) {
+                // Do not leave the help process and its pipes lingering
+                proc.destroy();
+            }
+        }
+    }
+    
     public void parse(Image image, ContentHandler handler, Metadata metadata, 
ParseContext context) throws IOException,
             SAXException, TikaException {
 
@@ -212,6 +257,52 @@ public class TesseractOCRParser extends AbstractParser {
 
     }
 
+    /**
+     * This method is used to process the image to an OCR-friendly format.
+     * @param streamingObject input image to be processed
+     * @param config TesseractOCRconfig class to get ImageMagick properties
+     * @throws IOException
+     * @throws TikaException
+     */
+    private void processImage(File streamingObject, TesseractOCRConfig config) 
throws IOException, TikaException {
+       
+       // fetch rotation script from resources
+       InputStream in = getClass().getResourceAsStream("rotation.py");
+       TemporaryResources tmp = new TemporaryResources();
+       File rotationScript = tmp.createTemporaryFile();
+       Files.copy(in, rotationScript.toPath(), 
StandardCopyOption.REPLACE_EXISTING);
+       
+       String cmd = "python " + rotationScript.getAbsolutePath() + " -f " + 
streamingObject.getAbsolutePath();
+       String angle = "0"; 
+                       
+       DefaultExecutor executor = new DefaultExecutor();
+       ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+       PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream);
+        executor.setStreamHandler(streamHandler);
+        
+        // determine the angle of rotation required to make the text horizontal
+        CommandLine cmdLine = CommandLine.parse(cmd);
+        if(hasPython()) {
+               try {
+                       executor.execute(cmdLine);
+                       angle = outputStream.toString().trim();
+            } catch(Exception e) {     
+                       e.printStackTrace();
+               }
+        }
+              
+        // process the image - parameter values can be set in 
TesseractOCRConfig.properties
+       String line = "convert -density " + config.getDensity() + " -depth " + 
config.getDepth() + " -colorspace " + config.getColorspace() +  " -filter " + 
config.getFilter() + " -resize " + config.getResize() + "% -rotate "+ angle + " 
" + streamingObject.getAbsolutePath() + " " + 
streamingObject.getAbsolutePath();           
+        cmdLine = CommandLine.parse(line);
+        try {
+               executor.execute(cmdLine);
+        } catch(Exception e) { 
+               e.printStackTrace();
+       } 
+       
+        tmp.close();
+    }
+    
     private void parse(TikaInputStream tikaInputStream, File tmpImgFile, 
XHTMLContentHandler xhtml, TesseractOCRConfig config)
             throws IOException, SAXException, TikaException {
         File tmpTxtOutput = null;
@@ -222,7 +313,18 @@ public class TesseractOCRParser extends AbstractParser {
 
             if (size >= config.getMinFileSizeToOcr() && size <= 
config.getMaxFileSizeToOcr()) {
 
-                doOCR(input, tmpImgFile, config);
+               // copy the contents of the original input file into a 
temporary file
+               // which will be processed for OCR
+               TemporaryResources tmp = new TemporaryResources();
+               File tmpFile = tmp.createTemporaryFile();
+               FileUtils.copyFile(input, tmpFile);
+               
+               // Process image if ImageMagick Tool is present
+               if(hasImageMagick(config)) {
+                       processImage(tmpFile,config);
+               }
+               
+                doOCR(tmpFile, tmpImgFile, config);                
 
                 // Tesseract appends .txt to output file name
                 tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + ".txt");
@@ -232,7 +334,8 @@ public class TesseractOCRParser extends AbstractParser {
                         extractOutput(is, xhtml);
                     }
                 }
-
+                
+                tmp.close();
             }
 
         } finally {
@@ -369,4 +472,7 @@ public class TesseractOCRParser extends AbstractParser {
         return System.getProperty("os.name").startsWith("Windows") ? 
"tesseract.exe" : "tesseract";
     }
 
+    static String getImageMagickProg() {
+        // The program name carries an .exe suffix only when os.name starts with "Windows"
+        boolean onWindows = System.getProperty("os.name").startsWith("Windows");
+        if (onWindows) {
+            return "convert.exe";
+        }
+        return "convert";
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/48b27d21/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
 
b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
index cb2151c..a0a0b54 100644
--- 
a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
+++ 
b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
@@ -13,9 +13,18 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
+# Tesseract properties
 tesseractPath=
 language=eng
 pageSegMode=1
 maxFileSizeToOcr=2147483647
 minFileSizeToOcr=0
-timeout=120
\ No newline at end of file
+timeout=120
+
+# properties for image processing
+ImageMagickPath=
+density=300
+depth=4
+colorspace=gray
+filter=triangle
+resize=900
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/48b27d21/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py 
b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py
new file mode 100644
index 0000000..b24fabf
--- /dev/null
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py
@@ -0,0 +1,72 @@
+"""
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from __future__ import division, print_function
+from skimage.transform import radon
+from PIL import Image
+from numpy import asarray, mean, array, blackman
+import numpy
+from numpy.fft import rfft
+import matplotlib.pyplot as plt
+from matplotlib.mlab import rms_flat
+
+import sys
+import getopt
+
+def main(argv):
+       """Print the rotation value computed for the image given with -f/--file.
+
+       argv is the command-line argument list without the program name. The image
+       is loaded as grayscale, radon-transformed, and the index of the projection
+       with the largest RMS value is printed as -(90 - index) with two decimals.
+       """
+       filename = ''
+       
+       # NOTE: this checks the full sys.argv (program name included), not argv
+       if len(sys.argv) < 3:
+               print('Usage: rotation_spacing.py -f <filename>')
+               sys.exit()
+       try:
+         opts, args = getopt.getopt(argv,"hf:",["file="])
+       except getopt.GetoptError:
+         print('rotation_spacing.py -f <filename>')
+         sys.exit(2)
+       for opt, arg in opts:
+         if opt == '-h':
+            print('Usage: rotation_spacing.py -f <filename>')
+            sys.exit()
+         elif opt in ("-f", "--file"):
+            filename = arg
+
+       # Use the parabolic-based argmax when the optional "parabolic" module can be
+       # imported; otherwise fall back to numpy's argmax
+       try:
+               from parabolic import parabolic
+
+               def argmax(x):
+                   return parabolic(x, numpy.argmax(x))[0]
+       except ImportError:
+               from numpy import argmax
+
+       # Load file, converting to grayscale
+       I = asarray(Image.open(filename).convert('L'))
+       I = I - mean(I)  # Demean; make the brightness extend above and below 
zero
+
+       # Do the radon transform (nothing is displayed)
+       sinogram = radon(I)
+
+       # Find the RMS value of each row and find "busiest" rotation,
+       # where the transform is lined up perfectly with the alternating dark
+       # text and white lines
+       r = array([rms_flat(line) for line in sinogram.transpose()])
+       rotation = argmax(r)
+
+       # The only regular output of the script: the computed value with two decimals
+       print('{:.2f}'.format(-(90-rotation)))
+
+if __name__ == "__main__":
+       main(sys.argv[1:])
\ No newline at end of file

Reply via email to