Author: thaichat04
Date: Tue Oct 21 09:32:06 2014
New Revision: 1633325
URL: http://svn.apache.org/r1633325
Log:
TIKA-1422 - Apply fix of [~olegt] in Windows
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1633325&r1=1633324&r2=1633325&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
Tue Oct 21 09:32:06 2014
@@ -26,11 +26,11 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
+import java.util.ArrayList;
import java.util.HashSet;
+import java.util.List;
import java.util.Map;
import java.util.Set;
-import java.util.List;
-import java.util.ArrayList;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.FutureTask;
@@ -45,20 +45,23 @@ import org.apache.tika.io.TemporaryResou
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.Parser;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.parser.image.ImageParser;
+import org.apache.tika.parser.image.PSDParser;
+import org.apache.tika.parser.image.TiffParser;
+import org.apache.tika.parser.jpeg.JpegParser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
- * TesseractOCRParser powered by tesseract-ocr engine.
- * To enable this parser, create a {@link TesseractOCRConfig}
- * object and pass it through a ParseContext.
- * Tesseract-ocr must be installed and on system path or
- * the path to its root folder must be provided:
+ * TesseractOCRParser powered by tesseract-ocr engine. To enable this parser,
+ * create a {@link TesseractOCRConfig} object and pass it through a
+ * ParseContext. Tesseract-ocr must be installed and on the system path, or the path
+ * to its root folder must be provided:
* <p>
* TesseractOCRConfig config = new TesseractOCRConfig();<br>
* //Needed if tesseract is not on system path<br>
@@ -69,226 +72,231 @@ import org.xml.sax.SAXException;
*
*/
public class TesseractOCRParser extends AbstractParser {
-
- private static final long serialVersionUID = 1L;
-
- private static final Set<MediaType> SUPPORTED_TYPES = getTypes();
-
- private static Set<MediaType> getTypes() {
- HashSet<MediaType> supportedTypes = new HashSet<MediaType>();
-
- supportedTypes.add(MediaType.image("png"));
- supportedTypes.add(MediaType.image("jpeg"));
- supportedTypes.add(MediaType.image("tiff"));
- supportedTypes.add(MediaType.image("x-ms-bmp"));
- supportedTypes.add(MediaType.image("gif"));
-
- return supportedTypes;
- }
-
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext arg0) {
- return SUPPORTED_TYPES;
- }
-
- private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
- if(!config.getTesseractPath().isEmpty()){
- Map<String, String> env = pb.environment();
- env.put("TESSDATA_PREFIX", config.getTesseractPath());
- }
+
+ private static final long serialVersionUID = 1L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES = getTypes();
+
+ private static Set<MediaType> getTypes() {
+ HashSet<MediaType> supportedTypes = new HashSet<MediaType>();
+
+ supportedTypes.add(MediaType.image("png"));
+ supportedTypes.add(MediaType.image("jpeg"));
+ supportedTypes.add(MediaType.image("tiff"));
+ supportedTypes.add(MediaType.image("x-ms-bmp"));
+ supportedTypes.add(MediaType.image("gif"));
+
+ return supportedTypes;
+ }
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext arg0) {
+ return SUPPORTED_TYPES;
+ }
+
+ private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
+ if (!config.getTesseractPath().isEmpty()) {
+ Map<String, String> env = pb.environment();
+ env.put("TESSDATA_PREFIX", config.getTesseractPath());
}
-
- public void parse(Image image, ContentHandler handler, Metadata
metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
-
- TemporaryResources tmp = new TemporaryResources();
- FileOutputStream fos = null;
- TikaInputStream tis = null;
- try{
- int w = image.getWidth(null);
- int h = image.getHeight(null);
- BufferedImage bImage = new BufferedImage(w, h,
BufferedImage.TYPE_INT_RGB);
- Graphics2D g2 = bImage.createGraphics();
- g2.drawImage(image, 0, 0, null);
- g2.dispose();
- File file = tmp.createTemporaryFile();
- fos = new FileOutputStream(file);
- ImageIO.write(bImage, "png", fos);
- bImage = null;
- tis = TikaInputStream.get(file);
- parse(tis, handler, metadata, context);
-
- }finally{
- tmp.dispose();
- if(tis != null)
- tis.close();
- if(fos != null)
- fos.close();
- }
-
-
- }
-
- @Override
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
-
- TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
- if(config == null) config = new TesseractOCRConfig();
-
- String[] checkCmd = {config.getTesseractPath() + "tesseract"};
- // If Tesseract is not on the path, do not try to run OCR.
- if (!ExternalParser.check(checkCmd)) return;
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ }
- TemporaryResources tmp = new TemporaryResources();
- File output = null;
- try {
- TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
- File input = tikaStream.getFile();
- long size = tikaStream.getLength();
-
- if(size >= config.getMinFileSizeToOcr() && size <=
config.getMaxFileSizeToOcr()){
-
- output = tmp.createTemporaryFile();
- doOCR(input, output, config);
-
- //Tesseract appends .txt to output file name
- output = new File(output.getAbsolutePath() + ".txt");
-
- if(output.exists())
- extractOutput(new FileInputStream(output), xhtml);
+ public void parse(Image image, ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException,
+ SAXException, TikaException {
- }
-
- } finally {
- tmp.dispose();
- if(output != null)
- output.delete();
-
- }
+ TemporaryResources tmp = new TemporaryResources();
+ FileOutputStream fos = null;
+ TikaInputStream tis = null;
+ try {
+ int w = image.getWidth(null);
+ int h = image.getHeight(null);
+ BufferedImage bImage = new BufferedImage(w, h,
BufferedImage.TYPE_INT_RGB);
+ Graphics2D g2 = bImage.createGraphics();
+ g2.drawImage(image, 0, 0, null);
+ g2.dispose();
+ File file = tmp.createTemporaryFile();
+ fos = new FileOutputStream(file);
+ ImageIO.write(bImage, "png", fos);
+ bImage = null;
+ tis = TikaInputStream.get(file);
+ parse(tis, handler, metadata, context);
+
+ } finally {
+ tmp.dispose();
+ if (tis != null)
+ tis.close();
+ if (fos != null)
+ fos.close();
}
- /**
- * Run external tesseract-ocr process.
- * @param input File to be ocred
- * @param output File to collect ocr result
- * @param config Configuration of tesseract-ocr engine
- * @throws TikaException if the extraction timed out
- * @throws IOException if an input error occurred
- */
- private void doOCR(File input, File output, TesseractOCRConfig config)
- throws IOException, TikaException {
- String[] cmd = {config.getTesseractPath() + "tesseract",
- input.getPath(),
- output.getPath() ,
- "-l",
- config.getLanguage() ,
- "-psm",
- config.getPageSegMode() };
-
- ProcessBuilder pb = new ProcessBuilder(cmd);
- setEnv(config, pb);
- final Process process = pb.start();
-
- process.getOutputStream().close();
- InputStream out = process.getInputStream();
- InputStream err = process.getErrorStream();
-
- logStream("OCR MSG", out, input);
- logStream("OCR ERROR", err, input);
-
- FutureTask<Integer> waitTask = new FutureTask<Integer>(new
Callable<Integer>() {
- public Integer call() throws Exception {
- return process.waitFor();
- }
- });
-
- Thread waitThread = new Thread(waitTask);
- waitThread.start();
-
- try {
- waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
-
- } catch (InterruptedException e) {
- waitThread.interrupt();
- process.destroy();
- Thread.currentThread().interrupt();
- throw new TikaException("TesseractOCRParser interrupted", e);
-
- } catch (ExecutionException e) {
- //should not be thrown
-
- } catch (TimeoutException e) {
- waitThread.interrupt();
- process.destroy();
- throw new TikaException("TesseractOCRParser timeout",
e);
- }
-
-
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata
metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
+ if (config == null)
+ config = new TesseractOCRConfig();
+
+ String[] checkCmd = { config.getTesseractPath() + "tesseract" };
+ // If Tesseract is not on the path, do not try to run OCR.
+ if (!ExternalParser.check(checkCmd))
+ return;
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+ TemporaryResources tmp = new TemporaryResources();
+ File output = null;
+ try {
+ TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
+ File input = tikaStream.getFile();
+ long size = tikaStream.getLength();
+
+ if (size >= config.getMinFileSizeToOcr() && size <=
config.getMaxFileSizeToOcr()) {
+
+ output = tmp.createTemporaryFile();
+ doOCR(input, output, config);
+
+ // Tesseract appends .txt to output file name
+ output = new File(output.getAbsolutePath() + ".txt");
+
+ if (output.exists())
+ extractOutput(new FileInputStream(output), xhtml);
+
+ }
+
+ } finally {
+ tmp.dispose();
+ if (output != null)
+ output.delete();
+
}
-
+ }
- /**
- * Reads the contents of the given stream and write it to the
- * given XHTML content handler.
- * The stream is closed once fully processed.
- *
- * @param stream Stream where is the result of ocr
- * @param xhtml XHTML content handler
- * @throws SAXException if the XHTML SAX events could not be handled
- * @throws IOException if an input error occurred
- */
- private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
- throws SAXException, IOException {
-
- Reader reader = new InputStreamReader(stream, "UTF-8");
- xhtml.startDocument();
- xhtml.startElement("div");
- try {
- char[] buffer = new char[1024];
- for (int n = reader.read(buffer); n != -1; n =
reader.read(buffer)) {
- if (n > 0) xhtml.characters(buffer, 0, n);
- }
- } finally {
- reader.close();
- }
- xhtml.endElement("div");
- xhtml.endDocument();
+ /**
+ * Run external tesseract-ocr process.
+ *
+ * @param input
+ * File to be ocred
+ * @param output
+ * File to collect ocr result
+ * @param config
+ * Configuration of tesseract-ocr engine
+ * @throws TikaException
+ * if the extraction timed out
+ * @throws IOException
+ * if an input error occurred
+ */
+ private void doOCR(File input, File output, TesseractOCRConfig config)
throws IOException, TikaException {
+ String[] cmd = { config.getTesseractPath() + "tesseract", input.getPath(),
output.getPath(), "-l",
+ config.getLanguage(), "-psm", config.getPageSegMode() };
+
+ ProcessBuilder pb = new ProcessBuilder(cmd);
+ setEnv(config, pb);
+ final Process process = pb.start();
+
+ process.getOutputStream().close();
+ InputStream out = process.getInputStream();
+ InputStream err = process.getErrorStream();
+
+ logStream("OCR MSG", out, input);
+ logStream("OCR ERROR", err, input);
+
+ FutureTask<Integer> waitTask = new FutureTask<Integer>(new
Callable<Integer>() {
+ public Integer call() throws Exception {
+ return process.waitFor();
+ }
+ });
+
+ Thread waitThread = new Thread(waitTask);
+ waitThread.start();
+
+ try {
+ waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
+
+ } catch (InterruptedException e) {
+ waitThread.interrupt();
+ process.destroy();
+ Thread.currentThread().interrupt();
+ throw new TikaException("TesseractOCRParser interrupted", e);
+
+ } catch (ExecutionException e) {
+ // should not be thrown
+
+ } catch (TimeoutException e) {
+ waitThread.interrupt();
+ process.destroy();
+ throw new TikaException("TesseractOCRParser timeout", e);
}
- /**
- * Starts a thread that reads the contents of the standard output
- * or error stream of the given process to not block the process.
- * The stream is closed once fully processed.
- */
- private void logStream(final String logType, final InputStream stream,
final File file) {
- new Thread() {
- public void run() {
- Reader reader = new InputStreamReader(stream);
- StringBuilder out = new StringBuilder();
- char[] buffer = new char[1024];
- try {
- for (int n = reader.read(buffer); n !=
-1; n = reader.read(buffer))
- out.append(buffer, 0, n);
- } catch (IOException e) {
-
- } finally {
- IOUtils.closeQuietly(stream);
- }
-
-
- String msg = out.toString();
- //log or discard message?
-
- }
- }.start();
+ }
+
+ /**
+ * Reads the contents of the given stream and writes it to the given XHTML
+ * content handler. The stream is closed once fully processed.
+ *
+ * @param stream
+ * Stream containing the OCR result
+ * @param xhtml
+ * XHTML content handler
+ * @throws SAXException
+ * if the XHTML SAX events could not be handled
+ * @throws IOException
+ * if an input error occurred
+ */
+ private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
throws SAXException, IOException {
+
+ Reader reader = new InputStreamReader(stream, "UTF-8");
+ xhtml.startDocument();
+ xhtml.startElement("div");
+ try {
+ char[] buffer = new char[1024];
+ for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+ if (n > 0)
+ xhtml.characters(buffer, 0, n);
+ }
+ } finally {
+ reader.close();
}
+ xhtml.endElement("div");
+ xhtml.endDocument();
+ }
+
+ /**
+ * Starts a thread that reads the contents of the standard output or error
+ * stream of the given process so that the process does not block. The stream is closed
+ * once fully processed.
+ */
+ private void logStream(final String logType, final InputStream stream, final
File file) {
+ new Thread() {
+ public void run() {
+ Reader reader = new InputStreamReader(stream);
+ StringBuilder out = new StringBuilder();
+ char[] buffer = new char[1024];
+ try {
+ for (int n = reader.read(buffer); n != -1; n = reader.read(buffer))
+ out.append(buffer, 0, n);
+ } catch (IOException e) {
-
-}
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
+ String msg = out.toString();
+ // log or discard message?
+ }
+ }.start();
+ }
+
+ private List<Parser> getImageParsers() {
+ List<Parser> parsers = new ArrayList<Parser>();
+ parsers.add(new ImageParser());
+ parsers.add(new PSDParser());
+ parsers.add(new TiffParser());
+ parsers.add(new JpegParser());
+ return parsers;
+ }
+
+}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java?rev=1633325&r1=1633324&r2=1633325&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
Tue Oct 21 09:32:06 2014
@@ -36,6 +36,8 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParserTest;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.junit.Test;
@@ -83,13 +85,19 @@ public class RFC822ParserTest {
try {
parser.parse(stream, handler, metadata, new ParseContext());
verify(handler).startDocument();
- //4 body-part divs -- two outer bodies and two inner bodies
- verify(handler,
times(4)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"),
any(Attributes.class));
- verify(handler, times(4)).endElement(XHTMLContentHandler.XHTML,
"div", "div");
- //5 paragraph elements, 4 for body-parts and 1 for encompassing
message
- verify(handler,
times(5)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"),
any(Attributes.class));
- verify(handler, times(5)).endElement(XHTMLContentHandler.XHTML,
"p", "p");
+ int bodyExpectedTimes = 4, multipackExpectedTimes = 5;;
+ int invokingTimes = bodyExpectedTimes;
+ TesseractOCRConfig config = new TesseractOCRConfig();
+ if (TesseractOCRParserTest.canRun(config)) {
+ invokingTimes = multipackExpectedTimes;
+ }
+
+ verify(handler,
times(invokingTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"),
eq("div"), any(Attributes.class));
+ verify(handler,
times(invokingTimes)).endElement(XHTMLContentHandler.XHTML, "div", "div");
+ verify(handler,
times(multipackExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML),
eq("p"), eq("p"), any(Attributes.class));
+ verify(handler,
times(multipackExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "p", "p");
verify(handler).endDocument();
+
} catch (Exception e) {
fail("Exception thrown: " + e.getMessage());
}