RFC822ParserTest.java

thaichat04 Tue, 21 Oct 2014 02:33:07 -0700

Author: thaichat04
Date: Tue Oct 21 09:32:06 2014
New Revision: 1633325

URL: http://svn.apache.org/r1633325
Log:
TIKA-1422 - Apply fix of [~olegt] in Windows


Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1633325&r1=1633324&r2=1633325&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
 Tue Oct 21 09:32:06 2014
@@ -26,11 +26,11 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.util.ArrayList;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.List;
-import java.util.ArrayList;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.FutureTask;
@@ -45,20 +45,23 @@ import org.apache.tika.io.TemporaryResou
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.parser.image.ImageParser;
+import org.apache.tika.parser.image.PSDParser;
+import org.apache.tika.parser.image.TiffParser;
+import org.apache.tika.parser.jpeg.JpegParser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 /**
- * TesseractOCRParser powered by tesseract-ocr engine.
- * To enable this parser, create a {@link TesseractOCRConfig}
- * object and pass it through a ParseContext.
- * Tesseract-ocr must be installed and on system path or
- * the path to its root folder must be provided:
+ * TesseractOCRParser powered by tesseract-ocr engine. To enable this parser,
+ * create a {@link TesseractOCRConfig} object and pass it through a
+ * ParseContext. Tesseract-ocr must be installed and on system path or the path
+ * to its root folder must be provided:
  * <p>
  * TesseractOCRConfig config = new TesseractOCRConfig();<br>
  * //Needed if tesseract is not on system path<br>
@@ -69,226 +72,231 @@ import org.xml.sax.SAXException;
  * 
  */
 public class TesseractOCRParser extends AbstractParser {
-       
-       private static final long serialVersionUID = 1L;
-       
-       private static final Set<MediaType> SUPPORTED_TYPES = getTypes();
-       
-       private static Set<MediaType> getTypes() {
-               HashSet<MediaType> supportedTypes = new HashSet<MediaType>();
-               
-               supportedTypes.add(MediaType.image("png"));
-               supportedTypes.add(MediaType.image("jpeg"));
-               supportedTypes.add(MediaType.image("tiff"));
-               supportedTypes.add(MediaType.image("x-ms-bmp"));
-               supportedTypes.add(MediaType.image("gif"));
-               
-               return supportedTypes;
-       }
-       
-       @Override
-       public Set<MediaType> getSupportedTypes(ParseContext arg0) {
-               return SUPPORTED_TYPES;
-       }
-
-    private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
-        if(!config.getTesseractPath().isEmpty()){
-            Map<String, String> env = pb.environment();
-            env.put("TESSDATA_PREFIX", config.getTesseractPath());
-        }
+
+  private static final long serialVersionUID = 1L;
+
+  private static final Set<MediaType> SUPPORTED_TYPES = getTypes();
+
+  private static Set<MediaType> getTypes() {
+    HashSet<MediaType> supportedTypes = new HashSet<MediaType>();
+
+    supportedTypes.add(MediaType.image("png"));
+    supportedTypes.add(MediaType.image("jpeg"));
+    supportedTypes.add(MediaType.image("tiff"));
+    supportedTypes.add(MediaType.image("x-ms-bmp"));
+    supportedTypes.add(MediaType.image("gif"));
+
+    return supportedTypes;
+  }
+
+  @Override
+  public Set<MediaType> getSupportedTypes(ParseContext arg0) {
+    return SUPPORTED_TYPES;
+  }
+
+  private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
+    if (!config.getTesseractPath().isEmpty()) {
+      Map<String, String> env = pb.environment();
+      env.put("TESSDATA_PREFIX", config.getTesseractPath());
     }
-       
-       public void parse(Image image, ContentHandler handler, Metadata 
metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-
-               TemporaryResources tmp = new TemporaryResources();
-               FileOutputStream fos = null;
-               TikaInputStream tis = null;
-               try{
-                       int w = image.getWidth(null);
-               int h = image.getHeight(null);
-               BufferedImage bImage = new BufferedImage(w, h, 
BufferedImage.TYPE_INT_RGB);
-               Graphics2D g2 = bImage.createGraphics();
-               g2.drawImage(image, 0, 0, null);
-               g2.dispose();
-               File file = tmp.createTemporaryFile();
-                       fos = new FileOutputStream(file);
-                       ImageIO.write(bImage, "png", fos);
-                       bImage = null;
-                       tis = TikaInputStream.get(file);
-                       parse(tis, handler, metadata, context);
-                       
-               }finally{
-                       tmp.dispose();
-                       if(tis != null)
-                               tis.close();
-                       if(fos != null)
-                               fos.close();
-               }
-               
-               
-       }
-
-       @Override
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-
-       TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
-       if(config == null) config = new TesseractOCRConfig();
-
-        String[] checkCmd = {config.getTesseractPath() + "tesseract"};
-        // If Tesseract is not on the path, do not try to run OCR.
-        if (!ExternalParser.check(checkCmd)) return;
-       
-       XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+  }
 
-        TemporaryResources tmp = new TemporaryResources();
-        File output = null;
-        try {
-               TikaInputStream  tikaStream = TikaInputStream.get(stream, tmp);
-               File input = tikaStream.getFile();
-               long size = tikaStream.getLength();
-               
-               if(size >= config.getMinFileSizeToOcr() && size <= 
config.getMaxFileSizeToOcr()){
-                       
-               output = tmp.createTemporaryFile();
-               doOCR(input, output, config);
-               
-                //Tesseract appends .txt to output file name
-                output = new File(output.getAbsolutePath() + ".txt");
-                
-                if(output.exists())
-                       extractOutput(new FileInputStream(output), xhtml);
+  public void parse(Image image, ContentHandler handler, Metadata metadata, 
ParseContext context) throws IOException,
+      SAXException, TikaException {
 
-               }
-        
-        } finally {
-               tmp.dispose();
-               if(output != null)
-                       output.delete();
-            
-        }
+    TemporaryResources tmp = new TemporaryResources();
+    FileOutputStream fos = null;
+    TikaInputStream tis = null;
+    try {
+      int w = image.getWidth(null);
+      int h = image.getHeight(null);
+      BufferedImage bImage = new BufferedImage(w, h, 
BufferedImage.TYPE_INT_RGB);
+      Graphics2D g2 = bImage.createGraphics();
+      g2.drawImage(image, 0, 0, null);
+      g2.dispose();
+      File file = tmp.createTemporaryFile();
+      fos = new FileOutputStream(file);
+      ImageIO.write(bImage, "png", fos);
+      bImage = null;
+      tis = TikaInputStream.get(file);
+      parse(tis, handler, metadata, context);
+
+    } finally {
+      tmp.dispose();
+      if (tis != null)
+        tis.close();
+      if (fos != null)
+        fos.close();
     }
 
-       /**
-        * Run external tesseract-ocr process.
-        * @param input File to be ocred
-     * @param output File to collect ocr result
-     * @param config Configuration of tesseract-ocr engine
-     * @throws TikaException if the extraction timed out
-     * @throws IOException if an input error occurred
-        */
-    private void doOCR(File input, File output, TesseractOCRConfig config)
-            throws IOException, TikaException {
-        String[] cmd = {config.getTesseractPath() + "tesseract",
-                                       input.getPath(), 
-                                               output.getPath() , 
-                                               "-l", 
-                                               config.getLanguage() , 
-                                               "-psm", 
-                                               config.getPageSegMode() };
-            
-        ProcessBuilder pb = new ProcessBuilder(cmd);
-        setEnv(config, pb);
-        final Process process = pb.start();
-            
-        process.getOutputStream().close();
-        InputStream out = process.getInputStream();
-        InputStream err = process.getErrorStream();
-            
-        logStream("OCR MSG", out, input);
-        logStream("OCR ERROR", err, input);
-           
-        FutureTask<Integer> waitTask = new FutureTask<Integer>(new 
Callable<Integer>() {
-               public Integer call() throws Exception {
-                   return process.waitFor();
-               }
-        });
-
-        Thread waitThread = new Thread(waitTask);
-        waitThread.start();
-          
-        try {
-               waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
-              
-        } catch (InterruptedException e) {
-               waitThread.interrupt();
-               process.destroy();
-               Thread.currentThread().interrupt();
-               throw new TikaException("TesseractOCRParser interrupted", e);
-               
-        } catch (ExecutionException e) {
-                       //should not be thrown
-                               
-               } catch (TimeoutException e) {
-                       waitThread.interrupt();
-                       process.destroy();
-                       throw new TikaException("TesseractOCRParser timeout", 
e);
-               }
-               
-            
+  }
+
+  @Override
+  public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext context)
+      throws IOException, SAXException, TikaException {
+
+    TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
+    if (config == null)
+      config = new TesseractOCRConfig();
+
+    String[] checkCmd = { config.getTesseractPath() + "tesseract" };
+    // If Tesseract is not on the path, do not try to run OCR.
+    if (!ExternalParser.check(checkCmd))
+      return;
+
+    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+    TemporaryResources tmp = new TemporaryResources();
+    File output = null;
+    try {
+      TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
+      File input = tikaStream.getFile();
+      long size = tikaStream.getLength();
+
+      if (size >= config.getMinFileSizeToOcr() && size <= 
config.getMaxFileSizeToOcr()) {
+
+        output = tmp.createTemporaryFile();
+        doOCR(input, output, config);
+
+        // Tesseract appends .txt to output file name
+        output = new File(output.getAbsolutePath() + ".txt");
+
+        if (output.exists())
+          extractOutput(new FileInputStream(output), xhtml);
+
+      }
+
+    } finally {
+      tmp.dispose();
+      if (output != null)
+        output.delete();
+
     }
-    
+  }
 
-    /**
-     * Reads the contents of the given stream and write it to the 
-     * given XHTML content handler.
-     * The stream is closed once fully processed.
-     *
-     * @param stream Stream where is the result of ocr
-     * @param xhtml XHTML content handler
-     * @throws SAXException if the XHTML SAX events could not be handled
-     * @throws IOException if an input error occurred
-     */
-    private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
-       throws SAXException, IOException {
- 
-        Reader reader = new InputStreamReader(stream, "UTF-8");
-        xhtml.startDocument();
-        xhtml.startElement("div");
-        try {
-            char[] buffer = new char[1024];
-            for (int n = reader.read(buffer); n != -1; n = 
reader.read(buffer)) {
-                if (n > 0) xhtml.characters(buffer, 0, n);
-            }
-        } finally {
-            reader.close();
-        }
-        xhtml.endElement("div");
-        xhtml.endDocument();
+  /**
+   * Run external tesseract-ocr process.
+   * 
+   * @param input
+   *          File to be ocred
+   * @param output
+   *          File to collect ocr result
+   * @param config
+   *          Configuration of tesseract-ocr engine
+   * @throws TikaException
+   *           if the extraction timed out
+   * @throws IOException
+   *           if an input error occurred
+   */
+  private void doOCR(File input, File output, TesseractOCRConfig config) 
throws IOException, TikaException {
+    String[] cmd = { config.getTesseractPath() + "tesseract", input.getPath(), 
output.getPath(), "-l",
+        config.getLanguage(), "-psm", config.getPageSegMode() };
+
+    ProcessBuilder pb = new ProcessBuilder(cmd);
+    setEnv(config, pb);
+    final Process process = pb.start();
+
+    process.getOutputStream().close();
+    InputStream out = process.getInputStream();
+    InputStream err = process.getErrorStream();
+
+    logStream("OCR MSG", out, input);
+    logStream("OCR ERROR", err, input);
+
+    FutureTask<Integer> waitTask = new FutureTask<Integer>(new 
Callable<Integer>() {
+      public Integer call() throws Exception {
+        return process.waitFor();
+      }
+    });
+
+    Thread waitThread = new Thread(waitTask);
+    waitThread.start();
+
+    try {
+      waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
+
+    } catch (InterruptedException e) {
+      waitThread.interrupt();
+      process.destroy();
+      Thread.currentThread().interrupt();
+      throw new TikaException("TesseractOCRParser interrupted", e);
+
+    } catch (ExecutionException e) {
+      // should not be thrown
+
+    } catch (TimeoutException e) {
+      waitThread.interrupt();
+      process.destroy();
+      throw new TikaException("TesseractOCRParser timeout", e);
     }
 
-    /**
-     * Starts a thread that reads the contents of the standard output
-     * or error stream of the given process to not block the process.
-     * The stream is closed once fully processed.
-     */
-    private void logStream(final String logType, final InputStream stream, 
final File file) {
-        new Thread() {
-            public void run() {
-               Reader reader = new InputStreamReader(stream);
-                StringBuilder out = new StringBuilder();
-                char[] buffer = new char[1024];
-                try {
-                                       for (int n = reader.read(buffer); n != 
-1; n = reader.read(buffer)) 
-                                               out.append(buffer, 0, n);
-                               } catch (IOException e) {
-                                       
-                               } finally {
-                    IOUtils.closeQuietly(stream);
-                }
-                       
-                               
-                               String msg = out.toString();
-                               //log or discard message?
-                               
-            }
-        }.start();
+  }
+
+  /**
+   * Reads the contents of the given stream and write it to the given XHTML
+   * content handler. The stream is closed once fully processed.
+   * 
+   * @param stream
+   *          Stream where is the result of ocr
+   * @param xhtml
+   *          XHTML content handler
+   * @throws SAXException
+   *           if the XHTML SAX events could not be handled
+   * @throws IOException
+   *           if an input error occurred
+   */
+  private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) 
throws SAXException, IOException {
+
+    Reader reader = new InputStreamReader(stream, "UTF-8");
+    xhtml.startDocument();
+    xhtml.startElement("div");
+    try {
+      char[] buffer = new char[1024];
+      for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+        if (n > 0)
+          xhtml.characters(buffer, 0, n);
+      }
+    } finally {
+      reader.close();
     }
+    xhtml.endElement("div");
+    xhtml.endDocument();
+  }
+
+  /**
+   * Starts a thread that reads the contents of the standard output or error
+   * stream of the given process to not block the process. The stream is closed
+   * once fully processed.
+   */
+  private void logStream(final String logType, final InputStream stream, final 
File file) {
+    new Thread() {
+      public void run() {
+        Reader reader = new InputStreamReader(stream);
+        StringBuilder out = new StringBuilder();
+        char[] buffer = new char[1024];
+        try {
+          for (int n = reader.read(buffer); n != -1; n = reader.read(buffer))
+            out.append(buffer, 0, n);
+        } catch (IOException e) {
 
-       
-}
+        } finally {
+          IOUtils.closeQuietly(stream);
+        }
 
+        String msg = out.toString();
+        // log or discard message?
 
+      }
+    }.start();
+  }
+
+  private List<Parser> getImageParsers() {
+    List<Parser> parsers = new ArrayList<Parser>();
+    parsers.add(new ImageParser());
+    parsers.add(new PSDParser());
+    parsers.add(new TiffParser());
+    parsers.add(new JpegParser());
+    return parsers;
+  }
+
+}

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java?rev=1633325&r1=1633324&r2=1633325&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
 Tue Oct 21 09:32:06 2014
@@ -36,6 +36,8 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParserTest;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.junit.Test;
@@ -83,13 +85,19 @@ public class RFC822ParserTest {
         try {
             parser.parse(stream, handler, metadata, new ParseContext());
             verify(handler).startDocument();
-            //4 body-part divs -- two outer bodies and two inner bodies
-            verify(handler, 
times(4)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), 
any(Attributes.class));
-            verify(handler, times(4)).endElement(XHTMLContentHandler.XHTML, 
"div", "div");
-            //5 paragraph elements, 4 for body-parts and 1 for encompassing 
message
-            verify(handler, 
times(5)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), 
any(Attributes.class));
-            verify(handler, times(5)).endElement(XHTMLContentHandler.XHTML, 
"p", "p");
+            int bodyExpectedTimes = 4, multipackExpectedTimes = 5;;
+            int invokingTimes = bodyExpectedTimes;
+            TesseractOCRConfig config = new TesseractOCRConfig();
+            if (TesseractOCRParserTest.canRun(config)) {
+              invokingTimes = multipackExpectedTimes;
+            }
+            
+            verify(handler, 
times(invokingTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), 
eq("div"), any(Attributes.class));
+            verify(handler, 
times(invokingTimes)).endElement(XHTMLContentHandler.XHTML, "div", "div");
+            verify(handler, 
times(multipackExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), 
eq("p"), eq("p"), any(Attributes.class));
+            verify(handler, 
times(multipackExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "p", "p");
             verify(handler).endDocument();
+            
         } catch (Exception e) {
             fail("Exception thrown: " + e.getMessage());
         }

svn commit: r1633325 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java test/java/org/apache/tika/parser/mail/RFC822ParserTest.java

Reply via email to