This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 24d801e Tika 3272 - Remove usage of rotation.py and Python dependency
(#397)
24d801e is described below
commit 24d801ed3d145a25206b48bea046da8ae3eef88b
Author: Peter Kronenberg <[email protected]>
AuthorDate: Thu Jan 21 11:39:21 2021 -0500
Tika 3272 - Remove usage of rotation.py and Python dependency (#397)
* TIKA-3272 Improve rotation handling; replace rotation.py with Tess4j's
code
* Remove python dependency
* Added Apache header to new files, with attribution to Tess4j
Co-authored-by: Peter Kronenberg <[email protected]>
---
.../apache/tika/parser/ocr/ImagePreprocessor.java | 240 ++++++++-------------
.../apache/tika/parser/ocr/TesseractOCRConfig.java | 31 +--
.../apache/tika/parser/ocr/TesseractOCRParser.java | 18 +-
.../apache/tika/parser/ocr/tess4j/ImageDeskew.java | 153 +++++++++++++
.../apache/tika/parser/ocr/tess4j/ImageUtil.java | 113 ++++++++++
.../tika/parser/ocr/TesseractOCRConfig.properties | 9 +-
.../org/apache/tika/parser/ocr/rotation.py | 73 -------
.../tika/parser/ocr/TesseractOCRParserTest.java | 23 +-
.../{testRotated.png => testRotated+10.png} | Bin
.../resources/test-documents/testRotated-10.png | Bin 0 -> 674259 bytes
10 files changed, 383 insertions(+), 277 deletions(-)
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
index 7f006da..9c84227 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
@@ -18,35 +18,39 @@ package org.apache.tika.parser.ocr;
import org.apache.commons.exec.CommandLine;
import org.apache.commons.exec.DefaultExecutor;
-import org.apache.commons.exec.PumpStreamHandler;
import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.parser.ocr.tess4j.ImageDeskew;
import org.apache.tika.utils.ProcessUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.ByteArrayOutputStream;
+import javax.imageio.ImageIO;
+import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
-import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
-import java.nio.file.StandardCopyOption;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.concurrent.TimeUnit;
+import java.util.stream.Stream;
class ImagePreprocessor {
- private static final Map<String,Boolean> IMAGE_MAGICK_PRESENT = new
HashMap<>();
- private static final Map<String, Boolean> PYTHON_PRESENT = new HashMap<>();
+ private static final Map<String, Boolean> IMAGE_MAGICK_PRESENT = new
HashMap<>();
private static final Logger LOG =
LoggerFactory.getLogger(TesseractOCRParser.class);
+ private static final double MINIMUM_DESKEW_THRESHOLD = 1.0D;
public static boolean hasImageMagick(TesseractOCRConfig config) {
// Fetch where the config says to find ImageMagick Program
@@ -84,159 +88,94 @@ class ImagePreprocessor {
return config.getImageMagickPath() + getImageMagickProg();
}
- private static String getPythonPath(TesseractOCRConfig config) {
- return config.getPythonPath()+getPythonProg();
- }
- public static boolean hasPython(TesseractOCRConfig config) {
- String pythonPath = getPythonPath(config);
- if (PYTHON_PRESENT.containsKey(pythonPath)) {
- return PYTHON_PRESENT.get(pythonPath);
- }
- //prevent memory bloat
- if (PYTHON_PRESENT.size() > 100) {
- PYTHON_PRESENT.clear();
- }
- //check that directory exists
- if (!config.getPythonPath().isEmpty() &&
- ! Files.isDirectory(Paths.get(config.getPythonPath()))) {
- PYTHON_PRESENT.put(pythonPath, false);
- return false;
- }
+ //this assumes that image magick is available
+ void process(Path sourceFile, Path targFile, Metadata metadata,
+ TesseractOCRConfig config) throws IOException {
- // check if python is installed and it has the required dependencies
for the rotation program to run
- boolean hasPython = false;
- String[] checkCmd = { pythonPath, "--version" };
- boolean hasPythonExecutable = ExternalParser.check(checkCmd);
- if (! hasPythonExecutable) {
- LOG.warn("couldn't run python executable ("+
- pythonPath+")");
- PYTHON_PRESENT.put(pythonPath, hasPythonExecutable);
- return hasPythonExecutable;
- }
+ double angle = config.isApplyRotation()
+ ? getAngle(sourceFile, metadata)
+ : 0d;
- TemporaryResources tmp = null;
- File importCheck = null;
- try {
- tmp = new TemporaryResources();
- importCheck = tmp.createTemporaryFile();
- String prg = "from skimage.transform import radon\n" +
- "from PIL import Image\n" + "" +
- "import numpy\n";
- OutputStreamWriter out = new OutputStreamWriter(new
FileOutputStream(importCheck), Charset.forName("UTF-8"));
- out.write(prg);
- out.flush();
- out.close();
- } catch (IOException e) {
- LOG.warn("Error writing file to test correct libs are available",
e);
- hasPython = false;
- PYTHON_PRESENT.put(pythonPath, hasPython);
- return hasPython;
- }
+ if (config.isEnableImageProcessing() || config.isApplyRotation() &&
angle != 0) {
+ // process the image - parameter values can be set in
TesseractOCRConfig.properties
+ CommandLine commandLine = new
CommandLine(getImageMagickPath(config));
+ if (System.getProperty("os.name").startsWith("Windows")) {
+ commandLine.addArgument("convert");
+ }
- Process p = null;
- try {
- p = Runtime.getRuntime().exec(new String[]{
- pythonPath,
-
ProcessUtils.escapeCommandLine(importCheck.getAbsolutePath())});
- boolean completed = p.waitFor(30, TimeUnit.SECONDS);
- hasPython = completed;
- if (! completed) {
- LOG.warn("python3 did not successfully complete after 30
seconds");
- LOG.warn("rotation.py cannot be called");
+ // Arguments for ImageMagick
+ final List<String> density = Arrays.asList("-density",
Integer.toString(config.getDensity()));
+ final List<String> depth = Arrays.asList("-depth",
Integer.toString(config.getDepth()));
+ final List<String> colorspace = Arrays.asList("-colorspace",
config.getColorspace());
+ final List<String> filter = Arrays.asList("-filter",
config.getFilter());
+ final List<String> resize = Arrays.asList("-resize",
config.getResize() + "%");
+ final List<String> rotate = Arrays.asList("-rotate",
Double.toString(-angle));
+ final List<String> sourceFileArg =
Collections.singletonList(sourceFile.toAbsolutePath().toString());
+ final List<String> targFileArg =
Collections.singletonList(targFile.toAbsolutePath().toString());
+
+ Stream<List<String>> stream = Stream.empty();
+ if (angle == 0) {
+ if (config.isEnableImageProcessing()) {
+ // Do pre-processing, but don't do any rotation
+ stream = Stream.of(
+ density,
+ depth,
+ colorspace,
+ filter,
+ resize,
+ sourceFileArg,
+ targFileArg);
+ }
+ } else if (config.isEnableImageProcessing()) {
+ // Do pre-processing with rotation
+ stream = Stream.of(
+ density,
+ depth,
+ colorspace,
+ filter,
+ resize,
+ rotate,
+ sourceFileArg,
+ targFileArg);
+
+ } else if (config.isApplyRotation()) {
+ // Just rotation
+ stream = Stream.of(
+ rotate,
+ sourceFileArg,
+ targFileArg);
}
- } catch (SecurityException e) {
- throw e;
- } catch (Exception e) {
- LOG.warn("python3 ("+
- pythonPath+ ") is not installed with the required
dependencies: scikit-image and numpy",
- e);
- } finally {
- if (p != null) {
- p.destroyForcibly();
+ final String[] args =
stream.flatMap(Collection::stream).toArray(String[]::new);
+ commandLine.addArguments(args, true);
+ DefaultExecutor executor = new DefaultExecutor();
+ try {
+ executor.execute(commandLine);
+ } catch (SecurityException e) {
+ throw e;
+ } catch (Exception e) {
+ LOG.warn("ImageMagick failed (commandline: " + commandLine +
")", e);
}
- IOUtils.closeQuietly(tmp);
+ metadata.add(TesseractOCRParser.IMAGE_MAGICK, "true");
}
- PYTHON_PRESENT.put(pythonPath, hasPython);
- return hasPython;
}
- //this assumes that image magick is available
- void process(Path sourceFile, Path targFile, Metadata metadata,
- TesseractOCRConfig config) throws TikaException, IOException {
-
- String angle = getAngle(sourceFile, metadata, config);
-
- // process the image - parameter values can be set in
TesseractOCRConfig.properties
- CommandLine commandLine = new CommandLine(getImageMagickPath(config));
- if (System.getProperty("os.name").startsWith("Windows")) {
- commandLine.addArgument("convert");
- }
- String[] args = new String[]{
- "-density", Integer.toString(config.getDensity()),
- "-depth ", Integer.toString(config.getDepth()),
- "-colorspace", config.getColorspace(),
- "-filter", config.getFilter(),
- "-resize", config.getResize() + "%",
- "-rotate", angle,
- sourceFile.toAbsolutePath().toString(),
- targFile.toAbsolutePath().toString()
- };
- commandLine.addArguments(args, true);
- DefaultExecutor executor = new DefaultExecutor();
- try {
- executor.execute(commandLine);
- } catch (SecurityException e) {
- throw e;
- } catch (Exception e) {
- LOG.warn("ImageMagick failed (commandline: "+commandLine+")", e);
+ /**
+ * Get the current skew angle of the image. Positive = clockwise;
Negative = counter-clockwise
+ */
+ private double getAngle(Path sourceFile, Metadata metadata) throws
IOException {
+ BufferedImage bi = ImageIO.read(sourceFile.toFile());
+ ImageDeskew id = new ImageDeskew(bi);
+ double angle = id.getSkewAngle();
+
+ if (angle < MINIMUM_DESKEW_THRESHOLD && angle >
-MINIMUM_DESKEW_THRESHOLD) {
+ LOG.debug("Changing angle " + angle + " to 0.0");
+ angle = 0d;
+ } else {
+ metadata.add(TesseractOCRParser.IMAGE_ROTATION,
String.format(Locale.getDefault(), "%.3f", angle));
}
- metadata.add(TesseractOCRParser.IMAGE_MAGICK, "true");
- }
-
- private String getAngle(Path sourceFile, Metadata metadata,
TesseractOCRConfig config) throws IOException {
- String angle = "0";
- // fetch rotation script from resources
- TemporaryResources tmp = new TemporaryResources();
- File rotationScript = tmp.createTemporaryFile();
- try {
- try (InputStream in =
getClass().getResourceAsStream("rotation.py")) {
- Files.copy(in, rotationScript.toPath(),
StandardCopyOption.REPLACE_EXISTING);
- }
-
- DefaultExecutor executor = new DefaultExecutor();
- // determine the angle of rotation required to make the text
horizontal
- if (config.isApplyRotation() && hasPython(config)) {
- CommandLine commandLine = new
CommandLine(getPythonPath(config));
- String[] args = {"-W",
- "ignore",
- rotationScript.getAbsolutePath(),
- "-f",
- sourceFile.toString()};
- commandLine.addArguments(args, true);
-
- ByteArrayOutputStream outputStream = new
ByteArrayOutputStream();
- PumpStreamHandler streamHandler = new
PumpStreamHandler(outputStream);
- executor.setStreamHandler(streamHandler);
- String tmpAngle = "";
- try {
- executor.execute(commandLine);
- tmpAngle = outputStream.toString("UTF-8").trim();
- //verify that you've gotten a numeric value out
- Double.parseDouble(tmpAngle);
- metadata.add(TesseractOCRParser.IMAGE_ROTATION, tmpAngle);
- angle = tmpAngle;
- } catch (SecurityException e) {
- throw e;
- } catch (Exception e) {
- LOG.warn("rotation.py failed (commandline: " + commandLine
+ ") tmpAngle: " + tmpAngle, e);
- }
- }
- } finally {
- tmp.close();
- }
return angle;
}
@@ -244,9 +183,4 @@ class ImagePreprocessor {
return System.getProperty("os.name").startsWith("Windows") ?
"magick" : "convert";
}
-
- public static String getPythonProg() {
- return "python3";
- }
-
}
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 307cf07..1c90160 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -92,9 +92,6 @@ public class TesseractOCRConfig implements Serializable {
// Path to ImageMagick program, if not on system path.
private String imageMagickPath = "";
- // Path to the python3 executable, if not on system path
- private String pythonPath = "";
-
// resolution of processed image (in dpi).
private int density = 300;
@@ -126,7 +123,7 @@ public class TesseractOCRConfig implements Serializable {
/**
- * Default contructor.
+ * Default constructor.
*/
public TesseractOCRConfig() {
init(this.getClass().getResourceAsStream("TesseractOCRConfig.properties"));
@@ -197,9 +194,6 @@ public class TesseractOCRConfig implements Serializable {
setApplyRotation(
getProp(props, "applyRotation", isApplyRotation()));
- setPythonPath(
- getProp(props, "pythonPath", getPythonPath()));
-
loadOtherTesseractConfig(props);
}
@@ -558,24 +552,9 @@ public class TesseractOCRConfig implements Serializable {
this.imageMagickPath = imageMagickPath;
}
- /**
- * Path to the directory that contains the Python executable.
- * As of 2.0.0, Tika expects the executable python3 or python3.exe
- * to be in the directory specified by the pythonPath
- *
- * @param pythonPath
- */
- public void setPythonPath(String pythonPath) {
- this.pythonPath = FilenameUtils.normalize(pythonPath);
- }
-
- public String getPythonPath() {
- return pythonPath;
- }
/**
* @return Whether or not a rotation value should be calculated and passed
to ImageMagick before performing OCR.
- * (Requires that Python is installed).
*/
public boolean isApplyRotation() {
return this.applyRotation;
@@ -584,7 +563,7 @@ public class TesseractOCRConfig implements Serializable {
/**
* Sets whether or not a rotation value should be calculated and passed to
ImageMagick.
*
- * @param applyRotation to calculate and apply rotation, false to skip.
Default is false, true required Python installed.
+ * @param applyRotation to calculate and apply rotation, false to skip.
Default is false
*/
public void setApplyRotation(boolean applyRotation) {
this.applyRotation = applyRotation;
@@ -713,10 +692,4 @@ public class TesseractOCRConfig implements Serializable {
}
}
}
-
- void consistencyCheck() {
- if (applyRotation && !enableImageProcessing) {
- LOG.warn("can't apply rotation unless you've also enabled image
processing");
- }
- }
}
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 264295c..04d22c6 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -200,7 +200,7 @@ public class TesseractOCRParser extends AbstractParser {
public void parse(InputStream stream, ContentHandler handler, Metadata
metadata, ParseContext parseContext)
throws IOException, SAXException, TikaException {
TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class,
defaultConfig);
- config.consistencyCheck();
+
// If Tesseract is not on the path with the current config, do not try
to run OCR
// getSupportedTypes shouldn't have listed us as handling it, so this
should only
// occur if someone directly calls this parser, not via DefaultParser
or similar
@@ -239,7 +239,7 @@ public class TesseractOCRParser extends AbstractParser {
if (size >= config.getMinFileSizeToOcr() && size <=
config.getMaxFileSizeToOcr()) {
// Process image
- if (config.isEnableImageProcessing()) {
+ if (config.isEnableImageProcessing() ||
config.isApplyRotation()) {
if (! ImagePreprocessor.hasImageMagick(config)) {
LOG.warn("User has selected to preprocess images, but
I can't find ImageMagick." +
"Backing off to original file.");
@@ -247,17 +247,12 @@ public class TesseractOCRParser extends AbstractParser {
} else {
// copy the contents of the original input file into a
temporary file
// which will be preprocessed for OCR
- TemporaryResources tmp = new TemporaryResources();
- try {
+
+ try (TemporaryResources tmp = new
TemporaryResources()) {
Path tmpFile = tmp.createTempFile();
Files.copy(input, tmpFile,
StandardCopyOption.REPLACE_EXISTING);
- //if image magic is not available
IMAGE_PREPROCESSOR.process(tmpFile, tmpFile,
metadata, config);
doOCR(tmpFile.toFile(), tmpOCROutputFile, config);
- } finally {
- if (tmp != null) {
- tmp.dispose();
- }
}
}
} else {
@@ -585,11 +580,6 @@ public class TesseractOCRParser extends AbstractParser {
defaultConfig.setApplyRotation(applyRotation);
}
- @Field
- public void setPythonPath(String pythonPath) {
- defaultConfig.setPythonPath(pythonPath);
- }
-
public TesseractOCRConfig getDefaultConfig() {
return defaultConfig;
}
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/tess4j/ImageDeskew.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/tess4j/ImageDeskew.java
new file mode 100644
index 0000000..0cad992
--- /dev/null
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/tess4j/ImageDeskew.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Copied and pasted from Tess4j (https://sourceforge.net/projects/tess4j/)
+ */
+package org.apache.tika.parser.ocr.tess4j;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.awt.image.BufferedImage;
+
+public class ImageDeskew {
+ private static final Logger LOG =
LoggerFactory.getLogger(ImageDeskew.class);
+
+ private final BufferedImage cImage;
+ private final int cSteps = 200;
+ private double[] cSinA;
+ private double[] cCosA;
+ private double cDMin;
+ private int[] cHMatrix;
+
+ public ImageDeskew(BufferedImage var1) {
+ this.cImage = var1;
+ }
+
+ public double getSkewAngle() {
+ double var2 = 0.0D;
+ int var4 = 0;
+ this.calc();
+ HoughLine[] var1 = this.getTop(20);
+ if (var1.length < 20) {
+ return 0.0D;
+ } else {
+ for (int var5 = 0; var5 < 19; ++var5) {
+ var2 += var1[var5].alpha;
+ ++var4;
+ }
+
+ return var2 / (double) var4;
+ }
+ }
+
+ private HoughLine[] getTop(int var1) {
+ HoughLine[] var2 = new HoughLine[var1];
+
+ for (int var3 = 0; var3 < var1; ++var3) {
+ var2[var3] = new HoughLine();
+ }
+
+ int var4;
+ int var5;
+ for (var4 = 0; var4 < this.cHMatrix.length - 1; ++var4) {
+ if (this.cHMatrix[var4] > var2[var1 - 1].count) {
+ var2[var1 - 1].count = this.cHMatrix[var4];
+ var2[var1 - 1].index = var4;
+
+ for (var5 = var1 - 1; var5 > 0 && var2[var5].count > var2[var5
- 1].count; --var5) {
+ HoughLine var7 = var2[var5];
+ var2[var5] = var2[var5 - 1];
+ var2[var5 - 1] = var7;
+ }
+ }
+ }
+
+ for (int var6 = 0; var6 < var1; ++var6) {
+ var5 = var2[var6].index / this.cSteps;
+ var4 = var2[var6].index - var5 * this.cSteps;
+ var2[var6].alpha = this.getAlpha(var4);
+ var2[var6].d = (double) var5 + this.cDMin;
+ }
+
+ return var2;
+ }
+
+ private void calc() {
+ int var1 = (int) ((double) this.cImage.getHeight() / 4.0D);
+ int var2 = (int) ((double) this.cImage.getHeight() * 3.0D / 4.0D);
+ this.init();
+
+ for (int var3 = var1; var3 < var2; ++var3) {
+ for (int var4 = 1; var4 < this.cImage.getWidth() - 2; ++var4) {
+ if (ImageUtil.isBlack(this.cImage, var4, var3) &&
!ImageUtil.isBlack(this.cImage, var4, var3 + 1)) {
+ this.calc(var4, var3);
+ }
+ }
+ }
+
+ }
+
+ private void calc(int var1, int var2) {
+ for (int var7 = 0; var7 < this.cSteps - 1; ++var7) {
+ double var3 = (double) var2 * this.cCosA[var7] - (double) var1 *
this.cSinA[var7];
+ int var5 = (int) (var3 - this.cDMin);
+ int var6 = var5 * this.cSteps + var7;
+
+ try {
+ this.cHMatrix[var6]++;
+ } catch (Exception var9) {
+ LOG.warn("", var9);
+ }
+
+ }
+
+ }
+
+ private void init() {
+ this.cSinA = new double[this.cSteps - 1];
+ this.cCosA = new double[this.cSteps - 1];
+
+ for (int var3 = 0; var3 < this.cSteps - 1; ++var3) {
+ double var1 = this.getAlpha(var3) * 3.141592653589793D / 180.0D;
+ this.cSinA[var3] = Math.sin(var1);
+ this.cCosA[var3] = Math.cos(var1);
+ }
+
+ this.cDMin = -this.cImage.getWidth();
+ final double cDStep = 1.0D;
+ final int cDCount = (int) (2.0D * (double) (this.cImage.getWidth() +
this.cImage.getHeight()) / cDStep);
+ this.cHMatrix = new int[cDCount * this.cSteps];
+ }
+
+ public double getAlpha(int var1) {
+ final double cAlphaStart = -20.0D;
+ final double cAlphaStep = 0.2D;
+ return cAlphaStart + (double) var1 * cAlphaStep;
+ }
+
+ public static class HoughLine {
+ public int count = 0;
+ public int index = 0;
+ public double alpha;
+ public double d;
+
+ public HoughLine() {
+ }
+ }
+}
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/tess4j/ImageUtil.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/tess4j/ImageUtil.java
new file mode 100644
index 0000000..f7d7155
--- /dev/null
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/tess4j/ImageUtil.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Copied and pasted from Tess4j (https://sourceforge.net/projects/tess4j/)
+ */
+package org.apache.tika.parser.ocr.tess4j;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.awt.Color;
+import java.awt.Graphics2D;
+import java.awt.RenderingHints;
+import java.awt.geom.AffineTransform;
+import java.awt.image.BufferedImage;
+import java.awt.image.WritableRaster;
+
+public class ImageUtil {
+ private static final Logger LOG = LoggerFactory.getLogger(ImageUtil.class);
+
+ public ImageUtil() {
+ }
+
+ public static boolean isBlack(BufferedImage var0, int var1, int var2) {
+ if (var0.getType() == 12) {
+ WritableRaster var5 = var0.getRaster();
+ int var4 = var5.getSample(var1, var2, 0);
+ return var4 == 0;
+ } else {
+ short var3 = 140;
+ return isBlack(var0, var1, var2, var3);
+ }
+ }
+
+ public static boolean isBlack(BufferedImage var0, int var1, int var2, int
var3) {
+ double var8 = 0.0D;
+ if (var1 >= 0 && var2 >= 0 && var1 <= var0.getWidth() && var2 <=
var0.getHeight()) {
+ try {
+ int var4 = var0.getRGB(var1, var2);
+ int var5 = var4 >> 16 & 255;
+ int var6 = var4 >> 8 & 255;
+ int var7 = var4 & 255;
+ var8 = (double) var5 * 0.299D + (double) var6 * 0.587D +
(double) var7 * 0.114D;
+ } catch (Exception var11) {
+ LOG.warn("", var11);
+ }
+
+ return var8 < (double) var3;
+ } else {
+ return false;
+ }
+ }
+
+ public static BufferedImage rotate(BufferedImage var0, double var1, int
var3, int var4) {
+ int var5 = var0.getWidth(null);
+ int var6 = var0.getHeight(null);
+ int var10 = 0;
+ int var9 = 0;
+ int var8 = 0;
+ int var7 = 0;
+ int[] var11 = new int[]{0, 0, var5, 0, var5, var6, 0, var6};
+ double var12 = Math.toRadians(var1);
+
+ for (int var14 = 0; var14 < var11.length; var14 += 2) {
+ int var15 = (int) (Math.cos(var12) * (double) (var11[var14] -
var3) - Math.sin(var12) * (double) (var11[var14 + 1] - var4) + (double) var3);
+ int var16 = (int) (Math.sin(var12) * (double) (var11[var14] -
var3) + Math.cos(var12) * (double) (var11[var14 + 1] - var4) + (double) var4);
+ if (var15 > var9) {
+ var9 = var15;
+ }
+
+ if (var15 < var7) {
+ var7 = var15;
+ }
+
+ if (var16 > var10) {
+ var10 = var16;
+ }
+
+ if (var16 < var8) {
+ var8 = var16;
+ }
+ }
+
+ var3 -= var7;
+ var4 -= var8;
+ BufferedImage var17 = new BufferedImage(var9 - var7, var10 - var8,
var0.getType());
+ Graphics2D var18 = var17.createGraphics();
+ var18.setRenderingHint(RenderingHints.KEY_INTERPOLATION,
RenderingHints.VALUE_INTERPOLATION_BICUBIC);
+ var18.setBackground(Color.white);
+ var18.fillRect(0, 0, var17.getWidth(), var17.getHeight());
+ AffineTransform var19 = new AffineTransform();
+ var19.rotate(var12, var3, var4);
+ var18.setTransform(var19);
+ var18.drawImage(var0, -var7, -var8, null);
+ var18.dispose();
+ return var17;
+ }
+}
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
index 325462a..7eb4792 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
@@ -24,14 +24,15 @@ timeout=120
outputType=txt
preserveInterwordSpacing=false
-# properties for image processing
-# to enable processing, set enableImageProcessing to true
+# If true, correct image rotation
+applyRotation=false
+
+# properties for image pre-processing
+# to enable pre-processing, set enableImageProcessing to true. Requires
ImageMagick
enableImageProcessing=false
ImageMagickPath=
-pythonPath=
density=300
depth=4
colorspace=gray
filter=triangle
resize=200
-applyRotation=false
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/rotation.py
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/rotation.py
deleted file mode 100644
index c04f699..0000000
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/rotation.py
+++ /dev/null
@@ -1,73 +0,0 @@
-"""
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-
-from skimage.transform import radon
-from PIL import Image
-from numpy import asarray, mean, array, blackman, sqrt, absolute
-
-import sys
-import getopt
-
-def main(argv):
- filename = ''
-
- if len(sys.argv) < 3:
- print('Usage: rotation.py -f <filename>')
- sys.exit()
- try:
- opts, args = getopt.getopt(argv,"hf:",["file="])
- except getopt.GetoptError:
- print('rotation.py -f <filename>')
- sys.exit(2)
- for opt, arg in opts:
- if opt == '-h':
- print('Usage: rotation.py -f <filename>')
- sys.exit()
- elif opt in ("-f", "--file"):
- filename = arg
-
- try:
- from parabolic import parabolic
-
- def argmax(x):
- return parabolic(x, numpy.argmax(x))[0]
- except ImportError:
- from numpy import argmax
-
- # Load file, converting to grayscale
- I = asarray(Image.open(filename).convert('L'))
- I = I - mean(I) # Demean; make the brightness extend above and below
zero
-
- # Do the radon transform and display the result
- sinogram = radon(I)
-
- # Find the RMS value of each row and find "busiest" rotation,
- # where the transform is lined up perfectly with the alternating dark
- # text and white lines
- r = array([rms_flat(line) for line in sinogram.transpose()])
- rotation = argmax(r)
-
- print('{:.2f}'.format(-(90-rotation)))
-
-def rms_flat(a):
- """
- Return the root mean square of all the elements of *a*, flattened out.
- """
- return sqrt(mean(absolute(a)**2))
-
-if __name__ == "__main__":
- main(sys.argv[1:])
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index f91e046..c120467 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -120,18 +120,33 @@ public class TesseractOCRParserTest extends TikaTest {
}
@Test
- public void testRotatedOCR() throws Exception {
+ public void testPositiveRotateOCR() throws Exception {
TesseractOCRConfig config = new TesseractOCRConfig();
- assumeTrue(TesseractOCRParser.IMAGE_PREPROCESSOR.hasPython(config));
assumeTrue(TesseractOCRParser.IMAGE_PREPROCESSOR.hasImageMagick(config));
config.setApplyRotation(true);
- config.setEnableImageProcessing(true);
config.setResize(100);
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, config);
assumeTrue(canRun(config));
Metadata metadata = getMetadata(MediaType.image("png"));
- String ocr = getText("testRotated.png", metadata, parseContext);
+ String ocr = getText("testRotated+10.png", metadata, parseContext);
+ assertEquals("true", metadata.get(TesseractOCRParser.IMAGE_MAGICK));
+ assertEquals(10.0,
+
Double.parseDouble(metadata.get(TesseractOCRParser.IMAGE_ROTATION)), 0.01);
+ assertContains("Its had resolving otherwise she contented therefore",
ocr);
+ }
+
+ @Test
+ public void testNegativeRotateOCR() throws Exception {
+ TesseractOCRConfig config = new TesseractOCRConfig();
+
assumeTrue(TesseractOCRParser.IMAGE_PREPROCESSOR.hasImageMagick(config));
+ config.setApplyRotation(true);
+ config.setResize(100);
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(TesseractOCRConfig.class, config);
+ assumeTrue(canRun(config));
+ Metadata metadata = getMetadata(MediaType.image("png"));
+ String ocr = getText("testRotated-10.png", metadata, parseContext);
assertEquals("true", metadata.get(TesseractOCRParser.IMAGE_MAGICK));
assertEquals(-10.0,
Double.parseDouble(metadata.get(TesseractOCRParser.IMAGE_ROTATION)), 0.01);
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-documents/testRotated.png
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-documents/testRotated+10.png
similarity index 100%
rename from
tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-documents/testRotated.png
rename to
tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-documents/testRotated+10.png
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-documents/testRotated-10.png
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-documents/testRotated-10.png
new file mode 100644
index 0000000..100efc3
Binary files /dev/null and
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-documents/testRotated-10.png
differ