formatting changes
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/c2a8ac1e Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/c2a8ac1e Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/c2a8ac1e Branch: refs/heads/master Commit: c2a8ac1eabf6bb2a2e25e0715fe3a7adef715816 Parents: 27e999d Author: Zarana Parekh <[email protected]> Authored: Tue Jul 5 19:56:15 2016 -0700 Committer: Zarana Parekh <[email protected]> Committed: Tue Jul 5 19:56:15 2016 -0700 ---------------------------------------------------------------------- .../apache/tika/parser/ocr/TesseractOCRConfig.java | 6 ++---- .../apache/tika/parser/ocr/TesseractOCRParser.java | 16 ++++++++-------- .../tika/parser/ocr/TesseractOCRConfig.properties | 2 +- .../org/apache/tika/parser/ocr/rotation.py | 16 ++++++++-------- .../TesseractOCRConfig-full.properties | 1 - 5 files changed, 19 insertions(+), 22 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/c2a8ac1e/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java index 13bd28e..09daf18 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java @@ -260,8 +260,7 @@ public class TesseractOCRConfig implements Serializable{ * @return timeout value for Tesseract */ public int getTimeout() { return timeout; - } - + } /** @see #setEnableImageProcessing(boolean) * @return image processing is enabled or not */ @@ -432,6 +431,5 @@ public class TesseractOCRConfig implements Serializable{ */ private String getProp(Properties properties, String property, String 
defaultMissing) { return properties.getProperty(property, defaultMissing); - } - + } } http://git-wip-us.apache.org/repos/asf/tika/blob/c2a8ac1e/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index ae67425..8183d22 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -283,12 +283,12 @@ public class TesseractOCRParser extends AbstractParser { // determine the angle of rotation required to make the text horizontal CommandLine cmdLine = CommandLine.parse(cmd); if(hasPython()) { - try { - executor.execute(cmdLine); - angle = outputStream.toString().trim(); - } catch(Exception e) { - e.printStackTrace(); - } + try { + executor.execute(cmdLine); + angle = outputStream.toString().trim(); + } catch(Exception e) { + e.printStackTrace(); + } } // process the image - parameter values can be set in TesseractOCRConfig.properties @@ -327,7 +327,7 @@ public class TesseractOCRParser extends AbstractParser { processImage(tmpFile,config); } - doOCR(tmpFile, tmpImgFile, config); + doOCR(tmpFile, tmpImgFile, config); // Tesseract appends .txt to output file name tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + ".txt"); @@ -337,7 +337,7 @@ public class TesseractOCRParser extends AbstractParser { extractOutput(is, xhtml); } } - + tmp.close(); } http://git-wip-us.apache.org/repos/asf/tika/blob/c2a8ac1e/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties 
b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties index fc025d1..4ce66ec 100644 --- a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties +++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties @@ -22,7 +22,7 @@ minFileSizeToOcr=0 timeout=120 # properties for image processing -# to enable processing, set enableProcessing to 1 +# to enable processing, set enableImageProcessing to 1 enableImageProcessing=0 ImageMagickPath= density=300 http://git-wip-us.apache.org/repos/asf/tika/blob/c2a8ac1e/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py index b24fabf..0bb7e6a 100644 --- a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py +++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py @@ -16,10 +16,10 @@ """ from __future__ import division, print_function +import numpy from skimage.transform import radon from PIL import Image from numpy import asarray, mean, array, blackman -import numpy from numpy.fft import rfft import matplotlib.pyplot as plt from matplotlib.mlab import rms_flat @@ -31,27 +31,27 @@ def main(argv): filename = '' if len(sys.argv) < 3: - print('Usage: rotation_spacing.py -f <filename>') + print('Usage: rotation.py -f <filename>') sys.exit() try: opts, args = getopt.getopt(argv,"hf:",["file="]) except getopt.GetoptError: - print('rotation_spacing.py -f <filename>') + print('rotation.py -f <filename>') sys.exit(2) for opt, arg in opts: if opt == '-h': - print('Usage: rotation_spacing.py -f <filename>') + print('Usage: rotation.py -f <filename>') sys.exit() elif opt in ("-f", "--file"): filename = arg try: - from parabolic import parabolic + from parabolic 
import parabolic - def argmax(x): - return parabolic(x, numpy.argmax(x))[0] + def argmax(x): + return parabolic(x, numpy.argmax(x))[0] except ImportError: - from numpy import argmax + from numpy import argmax # Load file, converting to grayscale I = asarray(Image.open(filename).convert('L')) http://git-wip-us.apache.org/repos/asf/tika/blob/c2a8ac1e/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties b/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties index b6393be..3a96ef1 100644 --- a/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties +++ b/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties @@ -21,7 +21,6 @@ maxFileSizeToOcr=2000000 timeout=240 minFileSizeToOcr=1 -enableProcessing=1 ImageMagickPath=/usr/local/bin density=200 depth=8
