Repository: tika Updated Branches: refs/heads/master 95b2cd127 -> 6f16480f7
fix for TIKA-2021 contributed by Zarana Parekh Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/48b27d21 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/48b27d21 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/48b27d21 Branch: refs/heads/master Commit: 48b27d219f791ee14f1e0ffa18e4e80583f3df54 Parents: 2031de7 Author: Zarana Parekh <[email protected]> Authored: Fri Jun 24 18:53:00 2016 -0700 Committer: Zarana Parekh <[email protected]> Committed: Fri Jun 24 18:53:00 2016 -0700 ---------------------------------------------------------------------- tika-bundle/pom.xml | 2 + tika-parsers/pom.xml | 11 +- .../tika/parser/ocr/TesseractOCRConfig.java | 158 ++++++++++++++++++- .../tika/parser/ocr/TesseractOCRParser.java | 110 ++++++++++++- .../parser/ocr/TesseractOCRConfig.properties | 11 +- .../org/apache/tika/parser/ocr/rotation.py | 72 +++++++++ 6 files changed, 359 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/48b27d21/tika-bundle/pom.xml ---------------------------------------------------------------------- diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml index ca1d6f2..7fb5c8d 100644 --- a/tika-bundle/pom.xml +++ b/tika-bundle/pom.xml @@ -112,6 +112,7 @@ </dependencies> <build> + <pluginManagement> <plugins> <plugin> <groupId>org.apache.felix</groupId> @@ -426,6 +427,7 @@ </configuration> </plugin> </plugins> + </pluginManagement> </build> <organization> http://git-wip-us.apache.org/repos/asf/tika/blob/48b27d21/tika-parsers/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml index a126eed..cab385e 100644 --- a/tika-parsers/pom.xml +++ b/tika-parsers/pom.xml @@ -49,6 +49,7 @@ <sis.version>0.6</sis.version> <!-- used by POI, PDFBox and Jackcess ...try to sync --> <bouncycastle.version>1.54</bouncycastle.version> + <commonsexec.version>1.3</commonsexec.version> </properties> <dependencies> @@ -256,7 +257,13 @@ <artifactId>cxf-rt-rs-client</artifactId> <version>${cxf.version}</version> </dependency> - + <!-- TIKA-2021: Tesseract OCR Parser dependencies, + used for executing image processing script --> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-exec</artifactId> + <version>${commonsexec.version}</version> + </dependency> <!-- Provided dependencies --> <dependency> @@ -502,6 +509,7 @@ </file> </activation> <build> + <pluginManagement> <plugins> <plugin> <groupId>org.codehaus.gmaven</groupId> @@ -532,6 +540,7 @@ </executions> </plugin> </plugins> + </pluginManagement> </build> </profile> </profiles> http://git-wip-us.apache.org/repos/asf/tika/blob/48b27d21/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java index a35370a..d660142 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java @@ -63,6 +63,24 @@ public class TesseractOCRConfig implements Serializable{ // Maximum time (seconds) to wait for the ocring process termination private int timeout = 120; + // Path to ImageMagick program, if not on system path. + private String ImageMagickPath = ""; + + // resolution of processed image (in dpi). + private int density = 300; + + // number of bits in a color sample within a pixel. + private int depth = 4; + + // colorspace of processed image. + private String colorspace = "gray"; + + // filter to be applied to the processed image. + private String filter = "triangle"; + + // factor by which image is to be scaled. + private int resize = 900; + /** * Default contructor. */ @@ -99,6 +117,7 @@ public class TesseractOCRConfig implements Serializable{ } } + // set parameters for Tesseract setTesseractPath( getProp(props, "tesseractPath", getTesseractPath())); setTessdataPath( @@ -113,9 +132,23 @@ public class TesseractOCRConfig implements Serializable{ getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr())); setTimeout( getProp(props, "timeout", getTimeout())); + + // set parameters for ImageMagick + setImageMagickPath( + getProp(props, "ImageMagickPath", getImageMagickPath())); + setDensity( + getProp(props, "density", getDensity())); + setDepth( + getProp(props, "depth", getDepth())); + setColorspace( + getProp(props, "colorspace", getColorspace())); + setFilter( + getProp(props, "filter", getFilter())); + setResize( + getProp(props, "resize", getResize())); } - + /** @see #setTesseractPath(String tesseractPath)*/ public String getTesseractPath() { return tesseractPath; @@ -222,8 +255,130 @@ public class TesseractOCRConfig implements Serializable{ public int getTimeout() { return timeout; } + + /** + * @return the density + */ + public int getDensity() { + return density; + } + + /** + * @param density the density to set + * Default value is 300. + */ + public void setDensity(int density) { + if(density < 150 || density > 1200) { + throw new IllegalArgumentException("Invalid density value"); + } + this.density = density; + } /** + * @return the depth + */ + public int getDepth() { + return depth; + } + + /** + * @param depth the depth to set + * Default value is 4. + */ + public void setDepth(int depth) { + int[] allowedValues = {2, 4, 8, 16, 32, 64, 256, 4096}; + for (int i = 0; i < allowedValues.length; i++) { + if(depth == allowedValues[i]) { + this.depth = depth; + return; + } + } + throw new IllegalArgumentException("Invalid depth value"); + } + + /** + * @return the colorspace + */ + public String getColorspace() { + return colorspace; + } + + /** + * @param colorspace the colorspace to set + * Deafult value is gray. + */ + public void setColorspace(String colorspace) { + if(!colorspace.equals(null)) { + this.colorspace = colorspace; + } else { + throw new IllegalArgumentException("Invalid colorspace value"); + } + } + + /** + * @return the filter + */ + public String getFilter() { + return filter; + } + + /** + * @param filter the filter to set + * Default value is triangle. + */ + public void setFilter(String filter) { + if(filter.equals(null)) { + throw new IllegalArgumentException("Invalid filter value"); + } + + String[] allowedFilters = {"Point", "Hermite", "Cubic", "Box", "Gaussian", "Catrom", "Triangle", "Quadratic", "Mitchell"}; + for (int i = 0; i < allowedFilters.length; i++) { + if(filter.equalsIgnoreCase(allowedFilters[i])) { + this.filter = filter; + return; + } + } + throw new IllegalArgumentException("Invalid filter value"); + } + + /** + * @return the resize + */ + public int getResize() { + return resize; + } + + /** + * @param resize the resize to set + * Default value is 900. + */ + public void setResize(int resize) { + for(int i=1;i<10;i++) { + if(resize == i*100) { + this.resize = resize; + return; + } + } + throw new IllegalArgumentException("Invalid resize value"); + } + + /** @see #setImageMagickPath(String ImageMagickPath)*/ + public String getImageMagickPath() { + + return ImageMagickPath; + } + + /** + * Set the path to the ImageMagick executable, needed if it is not on system path. + */ + public void setImageMagickPath(String ImageMagickPath) { + if(!ImageMagickPath.isEmpty() && !ImageMagickPath.endsWith(File.separator)) + ImageMagickPath += File.separator; + + this.ImageMagickPath = ImageMagickPath; + } + + /** * Get property from the properties file passed in. * @param properties properties file to read from. * @param property the property to fetch. @@ -253,4 +408,5 @@ public class TesseractOCRConfig implements Serializable{ private String getProp(Properties properties, String property, String defaultMissing) { return properties.getProperty(property, defaultMissing); } + } http://git-wip-us.apache.org/repos/asf/tika/blob/48b27d21/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index a238a7c..1280aec 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -20,6 +20,8 @@ import javax.imageio.ImageIO; import java.awt.Image; import java.awt.image.BufferedImage; +import java.io.BufferedReader; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; @@ -27,6 +29,9 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; @@ -40,6 +45,10 @@ import java.util.concurrent.FutureTask; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; +import org.apache.commons.exec.CommandLine; +import org.apache.commons.exec.DefaultExecutor; +import org.apache.commons.exec.PumpStreamHandler; +import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.logging.LogFactory; import org.apache.tika.exception.TikaException; @@ -127,7 +136,43 @@ public class TesseractOCRParser extends AbstractParser { return hasTesseract; } + + public boolean hasImageMagick(TesseractOCRConfig config) { + // Fetch where the config says to find ImageMagick Program + String ImageMagick = config.getImageMagickPath() + getImageMagickProg(); + + // Have we already checked for a copy of ImageMagick Program there? + if (TESSERACT_PRESENT.containsKey(ImageMagick)) { + return TESSERACT_PRESENT.get(ImageMagick); + } + // Try running ImageMagick program from there, and see if it exists + works + String[] checkCmd = { ImageMagick }; + boolean hasImageMagick = ExternalParser.check(checkCmd); + TESSERACT_PRESENT.put(ImageMagick, hasImageMagick); + + return hasImageMagick; + + } + + public boolean hasPython() { + // check if python is installed and if the rotation program path has been specified correctly + + boolean hasPython = false; + + try { + Process proc = Runtime.getRuntime().exec("python -h"); + BufferedReader stdInput = new BufferedReader(new InputStreamReader(proc.getInputStream())); + if(stdInput.read() != -1) { + hasPython = true; + } + } catch (IOException e) { + e.printStackTrace(); + } + + return hasPython; + } + public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { @@ -212,6 +257,52 @@ public class TesseractOCRParser extends AbstractParser { } + /** + * This method is used to process the image to an OCR-friendly format. + * @param streamingObject input image to be processed + * @param config TesseractOCRconfig class to get ImageMagick properties + * @throws IOException + * @throws TikaException + */ + private void processImage(File streamingObject, TesseractOCRConfig config) throws IOException, TikaException { + + // fetch rotation script from resources + InputStream in = getClass().getResourceAsStream("rotation.py"); + TemporaryResources tmp = new TemporaryResources(); + File rotationScript = tmp.createTemporaryFile(); + Files.copy(in, rotationScript.toPath(), StandardCopyOption.REPLACE_EXISTING); + + String cmd = "python " + rotationScript.getAbsolutePath() + " -f " + streamingObject.getAbsolutePath(); + String angle = "0"; + + DefaultExecutor executor = new DefaultExecutor(); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream); + executor.setStreamHandler(streamHandler); + + // determine the angle of rotation required to make the text horizontal + CommandLine cmdLine = CommandLine.parse(cmd); + if(hasPython()) { + try { + executor.execute(cmdLine); + angle = outputStream.toString().trim(); + } catch(Exception e) { + e.printStackTrace(); + } + } + + // process the image - parameter values can be set in TesseractOCRConfig.properties + String line = "convert -density " + config.getDensity() + " -depth " + config.getDepth() + " -colorspace " + config.getColorspace() + " -filter " + config.getFilter() + " -resize " + config.getResize() + "% -rotate "+ angle + " " + streamingObject.getAbsolutePath() + " " + streamingObject.getAbsolutePath(); + cmdLine = CommandLine.parse(line); + try { + executor.execute(cmdLine); + } catch(Exception e) { + e.printStackTrace(); + } + + tmp.close(); + } + private void parse(TikaInputStream tikaInputStream, File tmpImgFile, XHTMLContentHandler xhtml, TesseractOCRConfig config) throws IOException, SAXException, TikaException { File tmpTxtOutput = null; @@ -222,7 +313,18 @@ public class TesseractOCRParser extends AbstractParser { if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) { - doOCR(input, tmpImgFile, config); + // copy the contents of the original input file into a temporary file + // which will be processed for OCR + TemporaryResources tmp = new TemporaryResources(); + File tmpFile = tmp.createTemporaryFile(); + FileUtils.copyFile(input, tmpFile); + + // Process image if ImageMagick Tool is present + if(hasImageMagick(config)) { + processImage(tmpFile,config); + } + + doOCR(tmpFile, tmpImgFile, config); // Tesseract appends .txt to output file name tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + ".txt"); @@ -232,7 +334,8 @@ public class TesseractOCRParser extends AbstractParser { extractOutput(is, xhtml); } } - + + tmp.close(); } } finally { @@ -369,4 +472,7 @@ public class TesseractOCRParser extends AbstractParser { return System.getProperty("os.name").startsWith("Windows") ? "tesseract.exe" : "tesseract"; } + static String getImageMagickProg() { + return System.getProperty("os.name").startsWith("Windows") ? "convert.exe" : "convert"; + } } http://git-wip-us.apache.org/repos/asf/tika/blob/48b27d21/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties index cb2151c..a0a0b54 100644 --- a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties +++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties @@ -13,9 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Tesseract properties tesseractPath= language=eng pageSegMode=1 maxFileSizeToOcr=2147483647 minFileSizeToOcr=0 -timeout=120 \ No newline at end of file +timeout=120 + +# properties for image processing +ImageMagickPath= +density=300 +depth=4 +colorspace=gray +filter=triangle +resize=900 \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/48b27d21/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py new file mode 100644 index 0000000..b24fabf --- /dev/null +++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py @@ -0,0 +1,72 @@ +""" + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +from __future__ import division, print_function +from skimage.transform import radon +from PIL import Image +from numpy import asarray, mean, array, blackman +import numpy +from numpy.fft import rfft +import matplotlib.pyplot as plt +from matplotlib.mlab import rms_flat + +import sys +import getopt + +def main(argv): + filename = '' + + if len(sys.argv) < 3: + print('Usage: rotation_spacing.py -f <filename>') + sys.exit() + try: + opts, args = getopt.getopt(argv,"hf:",["file="]) + except getopt.GetoptError: + print('rotation_spacing.py -f <filename>') + sys.exit(2) + for opt, arg in opts: + if opt == '-h': + print('Usage: rotation_spacing.py -f <filename>') + sys.exit() + elif opt in ("-f", "--file"): + filename = arg + + try: + from parabolic import parabolic + + def argmax(x): + return parabolic(x, numpy.argmax(x))[0] + except ImportError: + from numpy import argmax + + # Load file, converting to grayscale + I = asarray(Image.open(filename).convert('L')) + I = I - mean(I) # Demean; make the brightness extend above and below zero + + # Do the radon transform and display the result + sinogram = radon(I) + + # Find the RMS value of each row and find "busiest" rotation, + # where the transform is lined up perfectly with the alternating dark + # text and white lines + r = array([rms_flat(line) for line in sinogram.transpose()]) + rotation = argmax(r) + + print('{:.2f}'.format(-(90-rotation))) + +if __name__ == "__main__": + main(sys.argv[1:]) \ No newline at end of file
