This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 24d801e  Tika 3272 - Remove usage of rotation.py and Python dependency (#397)
24d801e is described below

commit 24d801ed3d145a25206b48bea046da8ae3eef88b
Author: Peter Kronenberg <[email protected]>
AuthorDate: Thu Jan 21 11:39:21 2021 -0500

    Tika 3272 - Remove usage of rotation.py and Python dependency (#397)
    
    * TIKA-3272 Improve rotation handling; replace rotation.py with Tess4j's code
    
    * Remove python dependency
    
    * Added Apache header to new files, with attribution to Tess4j
    
    Co-authored-by: Peter Kronenberg <[email protected]>
---
 .../apache/tika/parser/ocr/ImagePreprocessor.java  | 240 ++++++++-------------
 .../apache/tika/parser/ocr/TesseractOCRConfig.java |  31 +--
 .../apache/tika/parser/ocr/TesseractOCRParser.java |  18 +-
 .../apache/tika/parser/ocr/tess4j/ImageDeskew.java | 153 +++++++++++++
 .../apache/tika/parser/ocr/tess4j/ImageUtil.java   | 113 ++++++++++
 .../tika/parser/ocr/TesseractOCRConfig.properties  |   9 +-
 .../org/apache/tika/parser/ocr/rotation.py         |  73 -------
 .../tika/parser/ocr/TesseractOCRParserTest.java    |  23 +-
 .../{testRotated.png => testRotated+10.png}        | Bin
 .../resources/test-documents/testRotated-10.png    | Bin 0 -> 674259 bytes
 10 files changed, 383 insertions(+), 277 deletions(-)

diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
index 7f006da..9c84227 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
@@ -18,35 +18,39 @@ package org.apache.tika.parser.ocr;
 
 import org.apache.commons.exec.CommandLine;
 import org.apache.commons.exec.DefaultExecutor;
-import org.apache.commons.exec.PumpStreamHandler;
 import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.parser.ocr.tess4j.ImageDeskew;
 import org.apache.tika.utils.ProcessUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.ByteArrayOutputStream;
+import javax.imageio.ImageIO;
+import java.awt.image.BufferedImage;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
-import java.io.InputStream;
 import java.io.OutputStreamWriter;
 import java.nio.charset.Charset;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
-import java.nio.file.StandardCopyOption;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
 import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.concurrent.TimeUnit;
+import java.util.stream.Stream;
 
 class ImagePreprocessor {
-    private static final Map<String,Boolean> IMAGE_MAGICK_PRESENT = new 
HashMap<>();
-    private static final Map<String, Boolean> PYTHON_PRESENT = new HashMap<>();
+    private static final Map<String, Boolean> IMAGE_MAGICK_PRESENT = new 
HashMap<>();
     private static final Logger LOG = 
LoggerFactory.getLogger(TesseractOCRParser.class);
+    private static final double MINIMUM_DESKEW_THRESHOLD = 1.0D;
 
     public static boolean hasImageMagick(TesseractOCRConfig config) {
         // Fetch where the config says to find ImageMagick Program
@@ -84,159 +88,94 @@ class ImagePreprocessor {
         return config.getImageMagickPath() + getImageMagickProg();
     }
 
-    private static String getPythonPath(TesseractOCRConfig config) {
-        return config.getPythonPath()+getPythonProg();
-    }
 
-    public static boolean hasPython(TesseractOCRConfig config) {
-        String pythonPath = getPythonPath(config);
-        if (PYTHON_PRESENT.containsKey(pythonPath)) {
-            return PYTHON_PRESENT.get(pythonPath);
-        }
-        //prevent memory bloat
-        if (PYTHON_PRESENT.size() > 100) {
-            PYTHON_PRESENT.clear();
-        }
-        //check that directory exists
-        if (!config.getPythonPath().isEmpty() &&
-                ! Files.isDirectory(Paths.get(config.getPythonPath()))) {
-            PYTHON_PRESENT.put(pythonPath, false);
-            return false;
-        }
+    //this assumes that image magick is available
+    void process(Path sourceFile, Path targFile, Metadata metadata,
+                 TesseractOCRConfig config) throws IOException {
 
-        // check if python is installed and it has the required dependencies 
for the rotation program to run
-        boolean hasPython = false;
 
-        String[] checkCmd = { pythonPath, "--version" };
-        boolean hasPythonExecutable = ExternalParser.check(checkCmd);
-        if (! hasPythonExecutable) {
-            LOG.warn("couldn't run python executable ("+
-                    pythonPath+")");
-            PYTHON_PRESENT.put(pythonPath, hasPythonExecutable);
-            return hasPythonExecutable;
-        }
+        double angle = config.isApplyRotation()
+                ? getAngle(sourceFile, metadata)
+                : 0d;
 
-        TemporaryResources tmp = null;
-        File importCheck = null;
-        try {
-            tmp = new TemporaryResources();
-            importCheck = tmp.createTemporaryFile();
-            String prg = "from skimage.transform import radon\n" +
-                    "from PIL import Image\n" + "" +
-                    "import numpy\n";
-            OutputStreamWriter out = new OutputStreamWriter(new 
FileOutputStream(importCheck), Charset.forName("UTF-8"));
-            out.write(prg);
-            out.flush();
-            out.close();
-        } catch (IOException e) {
-            LOG.warn("Error writing file to test correct libs are available", 
e);
-            hasPython = false;
-            PYTHON_PRESENT.put(pythonPath, hasPython);
-            return hasPython;
-        }
+        if (config.isEnableImageProcessing() || config.isApplyRotation() && 
angle != 0) {
+            // process the image - parameter values can be set in 
TesseractOCRConfig.properties
+            CommandLine commandLine = new 
CommandLine(getImageMagickPath(config));
+            if (System.getProperty("os.name").startsWith("Windows")) {
+                commandLine.addArgument("convert");
+            }
 
-        Process p = null;
-        try {
-            p = Runtime.getRuntime().exec(new String[]{
-                    pythonPath,
-                    
ProcessUtils.escapeCommandLine(importCheck.getAbsolutePath())});
-            boolean completed = p.waitFor(30, TimeUnit.SECONDS);
-            hasPython = completed;
-            if (! completed) {
-                LOG.warn("python3 did not successfully complete after 30 
seconds");
-                LOG.warn("rotation.py cannot be called");
+            // Arguments for ImageMagick
+            final List<String> density = Arrays.asList("-density", 
Integer.toString(config.getDensity()));
+            final List<String> depth = Arrays.asList("-depth", 
Integer.toString(config.getDepth()));
+            final List<String> colorspace = Arrays.asList("-colorspace", 
config.getColorspace());
+            final List<String> filter = Arrays.asList("-filter", 
config.getFilter());
+            final List<String> resize = Arrays.asList("-resize", 
config.getResize() + "%");
+            final List<String> rotate = Arrays.asList("-rotate", 
Double.toString(-angle));
+            final List<String> sourceFileArg = 
Collections.singletonList(sourceFile.toAbsolutePath().toString());
+            final List<String> targFileArg = 
Collections.singletonList(targFile.toAbsolutePath().toString());
+
+            Stream<List<String>> stream = Stream.empty();
+            if (angle == 0) {
+                if (config.isEnableImageProcessing()) {
+                    // Do pre-processing, but don't do any rotation
+                    stream = Stream.of(
+                            density,
+                            depth,
+                            colorspace,
+                            filter,
+                            resize,
+                            sourceFileArg,
+                            targFileArg);
+                }
+            } else if (config.isEnableImageProcessing()) {
+                // Do pre-processing with rotation
+                stream = Stream.of(
+                        density,
+                        depth,
+                        colorspace,
+                        filter,
+                        resize,
+                        rotate,
+                        sourceFileArg,
+                        targFileArg);
+
+            } else if (config.isApplyRotation()) {
+                // Just rotation
+                stream = Stream.of(
+                        rotate,
+                        sourceFileArg,
+                        targFileArg);
             }
-        } catch (SecurityException e) {
-            throw e;
-        } catch (Exception e) {
-            LOG.warn("python3 ("+
-                            pythonPath+ ") is not installed with the required 
dependencies: scikit-image and numpy",
-                    e);
-        } finally {
-            if (p != null) {
-                p.destroyForcibly();
+            final String[] args = 
stream.flatMap(Collection::stream).toArray(String[]::new);
+            commandLine.addArguments(args, true);
+            DefaultExecutor executor = new DefaultExecutor();
+            try {
+                executor.execute(commandLine);
+            } catch (SecurityException e) {
+                throw e;
+            } catch (Exception e) {
+                LOG.warn("ImageMagick failed (commandline: " + commandLine + 
")", e);
             }
-            IOUtils.closeQuietly(tmp);
+            metadata.add(TesseractOCRParser.IMAGE_MAGICK, "true");
         }
-        PYTHON_PRESENT.put(pythonPath, hasPython);
-        return hasPython;
     }
 
-    //this assumes that image magick is available
-    void process(Path sourceFile, Path targFile, Metadata metadata,
-                 TesseractOCRConfig config) throws TikaException, IOException {
-
-        String angle = getAngle(sourceFile, metadata, config);
-
-        // process the image - parameter values can be set in 
TesseractOCRConfig.properties
-        CommandLine commandLine = new CommandLine(getImageMagickPath(config));
-        if (System.getProperty("os.name").startsWith("Windows")) {
-            commandLine.addArgument("convert");
-        }
-        String[] args = new String[]{
-                "-density", Integer.toString(config.getDensity()),
-                "-depth ", Integer.toString(config.getDepth()),
-                "-colorspace", config.getColorspace(),
-                "-filter", config.getFilter(),
-                "-resize", config.getResize() + "%",
-                "-rotate", angle,
-                sourceFile.toAbsolutePath().toString(),
-                targFile.toAbsolutePath().toString()
-        };
-        commandLine.addArguments(args, true);
-        DefaultExecutor executor = new DefaultExecutor();
-        try {
-            executor.execute(commandLine);
-        } catch (SecurityException e) {
-            throw e;
-        } catch (Exception e) {
-            LOG.warn("ImageMagick failed (commandline: "+commandLine+")", e);
+    /**
+     * Get the current skew angle of the image.  Positive = clockwise; 
Negative = counter-clockwise
+     */
+    private double getAngle(Path sourceFile, Metadata metadata) throws 
IOException {
+        BufferedImage bi = ImageIO.read(sourceFile.toFile());
+        ImageDeskew id = new ImageDeskew(bi);
+        double angle = id.getSkewAngle();
+
+        if (angle < MINIMUM_DESKEW_THRESHOLD && angle > 
-MINIMUM_DESKEW_THRESHOLD) {
+            LOG.debug("Changing angle " + angle  + " to 0.0");
+            angle = 0d;
+        } else {
+            metadata.add(TesseractOCRParser.IMAGE_ROTATION, 
String.format(Locale.getDefault(), "%.3f", angle));
         }
-        metadata.add(TesseractOCRParser.IMAGE_MAGICK, "true");
-    }
-
-    private String getAngle(Path sourceFile, Metadata metadata, 
TesseractOCRConfig config) throws IOException {
-        String angle = "0";
-        // fetch rotation script from resources
-        TemporaryResources tmp = new TemporaryResources();
-        File rotationScript = tmp.createTemporaryFile();
-        try {
-            try (InputStream in = 
getClass().getResourceAsStream("rotation.py")) {
-                Files.copy(in, rotationScript.toPath(), 
StandardCopyOption.REPLACE_EXISTING);
-            }
-
 
-            DefaultExecutor executor = new DefaultExecutor();
-            // determine the angle of rotation required to make the text 
horizontal
-            if (config.isApplyRotation() && hasPython(config)) {
-                CommandLine commandLine = new 
CommandLine(getPythonPath(config));
-                String[] args = {"-W",
-                        "ignore",
-                        rotationScript.getAbsolutePath(),
-                        "-f",
-                        sourceFile.toString()};
-                commandLine.addArguments(args, true);
-
-                ByteArrayOutputStream outputStream = new 
ByteArrayOutputStream();
-                PumpStreamHandler streamHandler = new 
PumpStreamHandler(outputStream);
-                executor.setStreamHandler(streamHandler);
-                String tmpAngle = "";
-                try {
-                    executor.execute(commandLine);
-                    tmpAngle = outputStream.toString("UTF-8").trim();
-                    //verify that you've gotten a numeric value out
-                    Double.parseDouble(tmpAngle);
-                    metadata.add(TesseractOCRParser.IMAGE_ROTATION, tmpAngle);
-                    angle = tmpAngle;
-                } catch (SecurityException e) {
-                    throw e;
-                } catch (Exception e) {
-                    LOG.warn("rotation.py failed (commandline: " + commandLine 
+ ") tmpAngle: " + tmpAngle, e);
-                }
-            }
-        } finally {
-            tmp.close();
-        }
         return angle;
     }
 
@@ -244,9 +183,4 @@ class ImagePreprocessor {
         return System.getProperty("os.name").startsWith("Windows") ?
                 "magick" : "convert";
     }
-
-    public static String getPythonProg() {
-        return "python3";
-    }
-
 }
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 307cf07..1c90160 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -92,9 +92,6 @@ public class TesseractOCRConfig implements Serializable {
     // Path to ImageMagick program, if not on system path.
     private String imageMagickPath = "";
 
-    // Path to the python3 executable, if not on system path
-    private String pythonPath = "";
-
     // resolution of processed image (in dpi).
     private int density = 300;
 
@@ -126,7 +123,7 @@ public class TesseractOCRConfig implements Serializable {
 
 
     /**
-     * Default contructor.
+     * Default constructor.
      */
     public TesseractOCRConfig() {
         
init(this.getClass().getResourceAsStream("TesseractOCRConfig.properties"));
@@ -197,9 +194,6 @@ public class TesseractOCRConfig implements Serializable {
         setApplyRotation(
                        getProp(props, "applyRotation", isApplyRotation()));
 
-        setPythonPath(
-                getProp(props, "pythonPath", getPythonPath()));
-
         loadOtherTesseractConfig(props);
     }
 
@@ -558,24 +552,9 @@ public class TesseractOCRConfig implements Serializable {
         this.imageMagickPath = imageMagickPath;
     }
 
-    /**
-     * Path to the directory that contains the Python executable.
-     * As of 2.0.0, Tika expects the executable python3 or python3.exe
-     * to be in the directory specified by the pythonPath
-     *
-     * @param pythonPath
-     */
-    public void setPythonPath(String pythonPath) {
-        this.pythonPath = FilenameUtils.normalize(pythonPath);
-    }
-
-    public String getPythonPath() {
-        return pythonPath;
-    }
 
     /**
      * @return Whether or not a rotation value should be calculated and passed 
to ImageMagick before performing OCR.
-     * (Requires that Python is installed).
      */
     public boolean isApplyRotation() {
        return this.applyRotation;
@@ -584,7 +563,7 @@ public class TesseractOCRConfig implements Serializable {
     /**
      * Sets whether or not a rotation value should be calculated and passed to 
ImageMagick.
      * 
-     * @param applyRotation to calculate and apply rotation, false to skip.  
Default is false, true required Python installed.
+     * @param applyRotation to calculate and apply rotation, false to skip.  
Default is false
      */
     public void setApplyRotation(boolean applyRotation) {
        this.applyRotation = applyRotation;
@@ -713,10 +692,4 @@ public class TesseractOCRConfig implements Serializable {
             }
         }
     }
-
-    void consistencyCheck() {
-        if (applyRotation && !enableImageProcessing) {
-            LOG.warn("can't apply rotation unless you've also enabled image 
processing");
-        }
-    }
 }
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 264295c..04d22c6 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -200,7 +200,7 @@ public class TesseractOCRParser extends AbstractParser {
     public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext parseContext)
             throws IOException, SAXException, TikaException {
         TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, 
defaultConfig);
-        config.consistencyCheck();
+
         // If Tesseract is not on the path with the current config, do not try 
to run OCR
         // getSupportedTypes shouldn't have listed us as handling it, so this 
should only
         //  occur if someone directly calls this parser, not via DefaultParser 
or similar
@@ -239,7 +239,7 @@ public class TesseractOCRParser extends AbstractParser {
             if (size >= config.getMinFileSizeToOcr() && size <= 
config.getMaxFileSizeToOcr()) {
 
                // Process image
-               if (config.isEnableImageProcessing()) {
+               if (config.isEnableImageProcessing() || 
config.isApplyRotation()) {
                     if (! ImagePreprocessor.hasImageMagick(config)) {
                         LOG.warn("User has selected to preprocess images, but 
I can't find ImageMagick." +
                                 "Backing off to original file.");
@@ -247,17 +247,12 @@ public class TesseractOCRParser extends AbstractParser {
                     } else {
                         // copy the contents of the original input file into a 
temporary file
                         // which will be preprocessed for OCR
-                        TemporaryResources tmp = new TemporaryResources();
-                        try {
+
+                        try (TemporaryResources tmp = new 
TemporaryResources()) {
                             Path tmpFile = tmp.createTempFile();
                             Files.copy(input, tmpFile, 
StandardCopyOption.REPLACE_EXISTING);
-                            //if image magic is not available
                             IMAGE_PREPROCESSOR.process(tmpFile, tmpFile, 
metadata, config);
                             doOCR(tmpFile.toFile(), tmpOCROutputFile, config);
-                        } finally {
-                            if (tmp != null) {
-                                tmp.dispose();
-                            }
                         }
                     }
                } else {
@@ -585,11 +580,6 @@ public class TesseractOCRParser extends AbstractParser {
         defaultConfig.setApplyRotation(applyRotation);
     }
 
-    @Field
-    public void setPythonPath(String pythonPath) {
-        defaultConfig.setPythonPath(pythonPath);
-    }
-
     public TesseractOCRConfig getDefaultConfig() {
         return defaultConfig;
     }
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/tess4j/ImageDeskew.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/tess4j/ImageDeskew.java
new file mode 100644
index 0000000..0cad992
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/tess4j/ImageDeskew.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Copied and pasted from Tess4j (https://sourceforge.net/projects/tess4j/)
+ */
+package org.apache.tika.parser.ocr.tess4j;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.awt.image.BufferedImage;
+
+public class ImageDeskew {
+    private static final Logger LOG = 
LoggerFactory.getLogger(ImageDeskew.class);
+
+    private final BufferedImage cImage;
+    private final int cSteps = 200;
+    private double[] cSinA;
+    private double[] cCosA;
+    private double cDMin;
+    private int[] cHMatrix;
+
+    public ImageDeskew(BufferedImage var1) {
+        this.cImage = var1;
+    }
+
+    public double getSkewAngle() {
+        double var2 = 0.0D;
+        int var4 = 0;
+        this.calc();
+        HoughLine[] var1 = this.getTop(20);
+        if (var1.length < 20) {
+            return 0.0D;
+        } else {
+            for (int var5 = 0; var5 < 19; ++var5) {
+                var2 += var1[var5].alpha;
+                ++var4;
+            }
+
+            return var2 / (double) var4;
+        }
+    }
+
+    private HoughLine[] getTop(int var1) {
+        HoughLine[] var2 = new HoughLine[var1];
+
+        for (int var3 = 0; var3 < var1; ++var3) {
+            var2[var3] = new HoughLine();
+        }
+
+        int var4;
+        int var5;
+        for (var4 = 0; var4 < this.cHMatrix.length - 1; ++var4) {
+            if (this.cHMatrix[var4] > var2[var1 - 1].count) {
+                var2[var1 - 1].count = this.cHMatrix[var4];
+                var2[var1 - 1].index = var4;
+
+                for (var5 = var1 - 1; var5 > 0 && var2[var5].count > var2[var5 
- 1].count; --var5) {
+                    HoughLine var7 = var2[var5];
+                    var2[var5] = var2[var5 - 1];
+                    var2[var5 - 1] = var7;
+                }
+            }
+        }
+
+        for (int var6 = 0; var6 < var1; ++var6) {
+            var5 = var2[var6].index / this.cSteps;
+            var4 = var2[var6].index - var5 * this.cSteps;
+            var2[var6].alpha = this.getAlpha(var4);
+            var2[var6].d = (double) var5 + this.cDMin;
+        }
+
+        return var2;
+    }
+
+    private void calc() {
+        int var1 = (int) ((double) this.cImage.getHeight() / 4.0D);
+        int var2 = (int) ((double) this.cImage.getHeight() * 3.0D / 4.0D);
+        this.init();
+
+        for (int var3 = var1; var3 < var2; ++var3) {
+            for (int var4 = 1; var4 < this.cImage.getWidth() - 2; ++var4) {
+                if (ImageUtil.isBlack(this.cImage, var4, var3) && 
!ImageUtil.isBlack(this.cImage, var4, var3 + 1)) {
+                    this.calc(var4, var3);
+                }
+            }
+        }
+
+    }
+
+    private void calc(int var1, int var2) {
+        for (int var7 = 0; var7 < this.cSteps - 1; ++var7) {
+            double var3 = (double) var2 * this.cCosA[var7] - (double) var1 * 
this.cSinA[var7];
+            int var5 = (int) (var3 - this.cDMin);
+            int var6 = var5 * this.cSteps + var7;
+
+            try {
+                this.cHMatrix[var6]++;
+            } catch (Exception var9) {
+                LOG.warn("", var9);
+            }
+
+        }
+
+    }
+
+    private void init() {
+        this.cSinA = new double[this.cSteps - 1];
+        this.cCosA = new double[this.cSteps - 1];
+
+        for (int var3 = 0; var3 < this.cSteps - 1; ++var3) {
+            double var1 = this.getAlpha(var3) * 3.141592653589793D / 180.0D;
+            this.cSinA[var3] = Math.sin(var1);
+            this.cCosA[var3] = Math.cos(var1);
+        }
+
+        this.cDMin = -this.cImage.getWidth();
+        final double cDStep = 1.0D;
+        final int cDCount = (int) (2.0D * (double) (this.cImage.getWidth() + 
this.cImage.getHeight()) / cDStep);
+        this.cHMatrix = new int[cDCount * this.cSteps];
+    }
+
+    public double getAlpha(int var1) {
+        final double cAlphaStart = -20.0D;
+        final double cAlphaStep = 0.2D;
+        return cAlphaStart + (double) var1 * cAlphaStep;
+    }
+
+    public static class HoughLine {
+        public int count = 0;
+        public int index = 0;
+        public double alpha;
+        public double d;
+
+        public HoughLine() {
+        }
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/tess4j/ImageUtil.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/tess4j/ImageUtil.java
new file mode 100644
index 0000000..f7d7155
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/tess4j/ImageUtil.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Copied and pasted from Tess4j (https://sourceforge.net/projects/tess4j/)
+ */
+package org.apache.tika.parser.ocr.tess4j;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.awt.Color;
+import java.awt.Graphics2D;
+import java.awt.RenderingHints;
+import java.awt.geom.AffineTransform;
+import java.awt.image.BufferedImage;
+import java.awt.image.WritableRaster;
+
+public class ImageUtil {
+    private static final Logger LOG = LoggerFactory.getLogger(ImageUtil.class);
+
+    public ImageUtil() {
+    }
+
+    public static boolean isBlack(BufferedImage var0, int var1, int var2) {
+        if (var0.getType() == 12) {
+            WritableRaster var5 = var0.getRaster();
+            int var4 = var5.getSample(var1, var2, 0);
+            return var4 == 0;
+        } else {
+            short var3 = 140;
+            return isBlack(var0, var1, var2, var3);
+        }
+    }
+
+    public static boolean isBlack(BufferedImage var0, int var1, int var2, int 
var3) {
+        double var8 = 0.0D;
+        if (var1 >= 0 && var2 >= 0 && var1 <= var0.getWidth() && var2 <= 
var0.getHeight()) {
+            try {
+                int var4 = var0.getRGB(var1, var2);
+                int var5 = var4 >> 16 & 255;
+                int var6 = var4 >> 8 & 255;
+                int var7 = var4 & 255;
+                var8 = (double) var5 * 0.299D + (double) var6 * 0.587D + 
(double) var7 * 0.114D;
+            } catch (Exception var11) {
+                   LOG.warn("", var11);
+            }
+
+            return var8 < (double) var3;
+        } else {
+            return false;
+        }
+    }
+
+    public static BufferedImage rotate(BufferedImage var0, double var1, int 
var3, int var4) {
+        int var5 = var0.getWidth(null);
+        int var6 = var0.getHeight(null);
+        int var10 = 0;
+        int var9 = 0;
+        int var8 = 0;
+        int var7 = 0;
+        int[] var11 = new int[]{0, 0, var5, 0, var5, var6, 0, var6};
+        double var12 = Math.toRadians(var1);
+
+        for (int var14 = 0; var14 < var11.length; var14 += 2) {
+            int var15 = (int) (Math.cos(var12) * (double) (var11[var14] - 
var3) - Math.sin(var12) * (double) (var11[var14 + 1] - var4) + (double) var3);
+            int var16 = (int) (Math.sin(var12) * (double) (var11[var14] - 
var3) + Math.cos(var12) * (double) (var11[var14 + 1] - var4) + (double) var4);
+            if (var15 > var9) {
+                var9 = var15;
+            }
+
+            if (var15 < var7) {
+                var7 = var15;
+            }
+
+            if (var16 > var10) {
+                var10 = var16;
+            }
+
+            if (var16 < var8) {
+                var8 = var16;
+            }
+        }
+
+        var3 -= var7;
+        var4 -= var8;
+        BufferedImage var17 = new BufferedImage(var9 - var7, var10 - var8, 
var0.getType());
+        Graphics2D var18 = var17.createGraphics();
+        var18.setRenderingHint(RenderingHints.KEY_INTERPOLATION, 
RenderingHints.VALUE_INTERPOLATION_BICUBIC);
+        var18.setBackground(Color.white);
+        var18.fillRect(0, 0, var17.getWidth(), var17.getHeight());
+        AffineTransform var19 = new AffineTransform();
+        var19.rotate(var12, var3, var4);
+        var18.setTransform(var19);
+        var18.drawImage(var0, -var7, -var8, null);
+        var18.dispose();
+        return var17;
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
index 325462a..7eb4792 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
@@ -24,14 +24,15 @@ timeout=120
 outputType=txt
 preserveInterwordSpacing=false
 
-# properties for image processing
-# to enable processing, set enableImageProcessing to true
+# If true, correct image rotation
+applyRotation=false
+
+# properties for image pre-processing
+# to enable pre-processing, set enableImageProcessing to true.  Requires ImageMagick
 enableImageProcessing=false
 ImageMagickPath=
-pythonPath=
 density=300
 depth=4
 colorspace=gray
 filter=triangle
 resize=200
-applyRotation=false
\ No newline at end of file
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/rotation.py
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/rotation.py
deleted file mode 100644
index c04f699..0000000
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/rotation.py
+++ /dev/null
@@ -1,73 +0,0 @@
-"""
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-
-from skimage.transform import radon
-from PIL import Image
-from numpy import asarray, mean, array, blackman, sqrt, absolute
-
-import sys
-import getopt
-
-def main(argv):
-       filename = ''
-       
-       if len(sys.argv) < 3:
-               print('Usage: rotation.py -f <filename>')
-               sys.exit()
-       try:
-         opts, args = getopt.getopt(argv,"hf:",["file="])
-       except getopt.GetoptError:
-         print('rotation.py -f <filename>')
-         sys.exit(2)
-       for opt, arg in opts:
-         if opt == '-h':
-            print('Usage: rotation.py -f <filename>')
-            sys.exit()
-         elif opt in ("-f", "--file"):
-            filename = arg
-
-       try:
-         from parabolic import parabolic
-
-         def argmax(x):
-               return parabolic(x, numpy.argmax(x))[0]
-       except ImportError:
-         from numpy import argmax
-
-       # Load file, converting to grayscale
-       I = asarray(Image.open(filename).convert('L'))
-       I = I - mean(I)  # Demean; make the brightness extend above and below zero
-
-       # Do the radon transform and display the result
-       sinogram = radon(I)
-
-       # Find the RMS value of each row and find "busiest" rotation,
-       # where the transform is lined up perfectly with the alternating dark
-       # text and white lines
-       r = array([rms_flat(line) for line in sinogram.transpose()])
-       rotation = argmax(r)
-
-       print('{:.2f}'.format(-(90-rotation)))
-
-def rms_flat(a):
-    """
-    Return the root mean square of all the elements of *a*, flattened out.
-    """
-    return sqrt(mean(absolute(a)**2))
-
-if __name__ == "__main__":
-       main(sys.argv[1:])
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index f91e046..c120467 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -120,18 +120,33 @@ public class TesseractOCRParserTest extends TikaTest {
     }
 
     @Test
-    public void testRotatedOCR() throws Exception {
+    public void testPositiveRotateOCR() throws Exception {
         TesseractOCRConfig config = new TesseractOCRConfig();
-        assumeTrue(TesseractOCRParser.IMAGE_PREPROCESSOR.hasPython(config));
         
assumeTrue(TesseractOCRParser.IMAGE_PREPROCESSOR.hasImageMagick(config));
         config.setApplyRotation(true);
-        config.setEnableImageProcessing(true);
         config.setResize(100);
         ParseContext parseContext = new ParseContext();
         parseContext.set(TesseractOCRConfig.class, config);
         assumeTrue(canRun(config));
         Metadata metadata = getMetadata(MediaType.image("png"));
-        String ocr = getText("testRotated.png", metadata, parseContext);
+        String ocr = getText("testRotated+10.png", metadata, parseContext);
+        assertEquals("true", metadata.get(TesseractOCRParser.IMAGE_MAGICK));
+        assertEquals(10.0,
+                
Double.parseDouble(metadata.get(TesseractOCRParser.IMAGE_ROTATION)), 0.01);
+        assertContains("Its had resolving otherwise she contented therefore", ocr);
+    }
+
+    @Test
+    public void testNegativeRotateOCR() throws Exception {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        
assumeTrue(TesseractOCRParser.IMAGE_PREPROCESSOR.hasImageMagick(config));
+        config.setApplyRotation(true);
+        config.setResize(100);
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(TesseractOCRConfig.class, config);
+        assumeTrue(canRun(config));
+        Metadata metadata = getMetadata(MediaType.image("png"));
+        String ocr = getText("testRotated-10.png", metadata, parseContext);
         assertEquals("true", metadata.get(TesseractOCRParser.IMAGE_MAGICK));
         assertEquals(-10.0,
                 
Double.parseDouble(metadata.get(TesseractOCRParser.IMAGE_ROTATION)), 0.01);
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-documents/testRotated.png
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-documents/testRotated+10.png
similarity index 100%
rename from 
tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-documents/testRotated.png
rename to 
tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-documents/testRotated+10.png
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-documents/testRotated-10.png
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-documents/testRotated-10.png
new file mode 100644
index 0000000..100efc3
Binary files /dev/null and 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-documents/testRotated-10.png
 differ

Reply via email to