No worries Hong-Thai! Will update and test, thanks! ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Chris Mattmann, Ph.D. Chief Architect Instrument Software and Science Data Systems Section (398) NASA Jet Propulsion Laboratory Pasadena, CA 91109 USA Office: 168-519, Mailstop: 168-527 Email: [email protected] WWW: http://sunset.usc.edu/~mattmann/ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Adjunct Associate Professor, Computer Science Department University of Southern California, Los Angeles, CA 90089 USA ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-----Original Message----- From: Hong-Thai Nguyen <[email protected]> Reply-To: "[email protected]" <[email protected]> Date: Tuesday, October 21, 2014 at 6:57 AM To: "[email protected]" <[email protected]> Subject: Re: svn commit: r1633325 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java test/java/org/apache/tika/parser/mail/RFC822ParserTest.java >Hi Chris, > >Yes, I made a mistake on this commit by missing a renaming file and broke >build, the next commit corrected: >Revision: 1633331 >Author: thaichat04 >Date: mardi 21 octobre 2014 11:47:54 >Message: >TIKA-1422 - Fixing build & minor refactory of naming test class >---- >Modified : >/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822P >arserTest.java >Added : >/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/Tesserac >tOCRParserTest.java >Deleted : >/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/Tesserac >tOCRTest.java > >Please 'pull' latest again then tell me if OK ? > >Sorry > >On Tue, Oct 21, 2014 at 3:49 PM, Mattmann, Chris A (3980) < >[email protected]> wrote: > >> Hi Hong-Thai, >> >> These commits look strange to me - it looks like it subtracts the >> whole files (and the unit test removed the test file, renamed it, >> and then added what largely looks like the same file, back?) >> >> Any idea what's up? >> >> Cheers, >> Chris >> >> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >> Chris Mattmann, Ph.D. 
>> Chief Architect >> Instrument Software and Science Data Systems Section (398) >> NASA Jet Propulsion Laboratory Pasadena, CA 91109 USA >> Office: 168-519, Mailstop: 168-527 >> Email: [email protected] >> WWW: http://sunset.usc.edu/~mattmann/ >> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >> Adjunct Associate Professor, Computer Science Department >> University of Southern California, Los Angeles, CA 90089 USA >> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >> >> >> >> >> >> >> -----Original Message----- >> From: "[email protected]" <[email protected]> >> Reply-To: "[email protected]" <[email protected]> >> Date: Tuesday, October 21, 2014 at 2:32 AM >> To: "[email protected]" <[email protected]> >> Subject: svn commit: r1633325 - in /tika/trunk/tika-parsers/src: >> main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java >> test/java/org/apache/tika/parser/mail/RFC822ParserTest.java >> >> >Author: thaichat04 >> >Date: Tue Oct 21 09:32:06 2014 >> >New Revision: 1633325 >> > >> >URL: http://svn.apache.org/r1633325 >> >Log: >> >TIKA-1422 - Apply fix of [~olegt] in Windows >> > >> >Modified: >> > >> >>>tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tessera >>>ct >> >OCRParser.java >> > >> >>>tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822 >>>Pa >> >rserTest.java >> > >> >Modified: >> >>>tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tessera >>>ct >> >OCRParser.java >> >URL: >> > >> >>http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/ap >>a >> >>>che/tika/parser/ocr/TesseractOCRParser.java?rev=1633325&r1=1633324&r2=16 >>>33 >> >325&view=diff >> >>>======================================================================== >>>== >> >==== >> >--- >> >>>tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tessera >>>ct >> >OCRParser.java (original) >> >+++ >> >>>tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tessera 
>>>ct >> >OCRParser.java Tue Oct 21 09:32:06 2014 >> >@@ -26,11 +26,11 @@ import java.io.IOException; >> > import java.io.InputStream; >> > import java.io.InputStreamReader; >> > import java.io.Reader; >> >+import java.util.ArrayList; >> > import java.util.HashSet; >> >+import java.util.List; >> > import java.util.Map; >> > import java.util.Set; >> >-import java.util.List; >> >-import java.util.ArrayList; >> > import java.util.concurrent.Callable; >> > import java.util.concurrent.ExecutionException; >> > import java.util.concurrent.FutureTask; >> >@@ -45,20 +45,23 @@ import org.apache.tika.io.TemporaryResou >> > import org.apache.tika.io.TikaInputStream; >> > import org.apache.tika.metadata.Metadata; >> > import org.apache.tika.mime.MediaType; >> >-import org.apache.tika.parser.Parser; >> > import org.apache.tika.parser.AbstractParser; >> > import org.apache.tika.parser.ParseContext; >> >+import org.apache.tika.parser.Parser; >> > import org.apache.tika.parser.external.ExternalParser; >> >+import org.apache.tika.parser.image.ImageParser; >> >+import org.apache.tika.parser.image.PSDParser; >> >+import org.apache.tika.parser.image.TiffParser; >> >+import org.apache.tika.parser.jpeg.JpegParser; >> > import org.apache.tika.sax.XHTMLContentHandler; >> > import org.xml.sax.ContentHandler; >> > import org.xml.sax.SAXException; >> > >> > /** >> >- * TesseractOCRParser powered by tesseract-ocr engine. >> >- * To enable this parser, create a {@link TesseractOCRConfig} >> >- * object and pass it through a ParseContext. >> >- * Tesseract-ocr must be installed and on system path or >> >- * the path to its root folder must be provided: >> >+ * TesseractOCRParser powered by tesseract-ocr engine. To enable this >> >parser, >> >+ * create a {@link TesseractOCRConfig} object and pass it through a >> >+ * ParseContext. 
Tesseract-ocr must be installed and on system path or >> >the path >> >+ * to its root folder must be provided: >> > * <p> >> > * TesseractOCRConfig config = new TesseractOCRConfig();<br> >> > * //Needed if tesseract is not on system path<br> >> >@@ -69,226 +72,231 @@ import org.xml.sax.SAXException; >> > * >> > */ >> > public class TesseractOCRParser extends AbstractParser { >> >- >> >- private static final long serialVersionUID = 1L; >> >- >> >- private static final Set<MediaType> SUPPORTED_TYPES = >>getTypes(); >> >- >> >- private static Set<MediaType> getTypes() { >> >- HashSet<MediaType> supportedTypes = new >> HashSet<MediaType>(); >> >- >> >- supportedTypes.add(MediaType.image("png")); >> >- supportedTypes.add(MediaType.image("jpeg")); >> >- supportedTypes.add(MediaType.image("tiff")); >> >- supportedTypes.add(MediaType.image("x-ms-bmp")); >> >- supportedTypes.add(MediaType.image("gif")); >> >- >> >- return supportedTypes; >> >- } >> >- >> >- @Override >> >- public Set<MediaType> getSupportedTypes(ParseContext arg0) { >> >- return SUPPORTED_TYPES; >> >- } >> >- >> >- private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) >>{ >> >- if(!config.getTesseractPath().isEmpty()){ >> >- Map<String, String> env = pb.environment(); >> >- env.put("TESSDATA_PREFIX", config.getTesseractPath()); >> >- } >> >+ >> >+ private static final long serialVersionUID = 1L; >> >+ >> >+ private static final Set<MediaType> SUPPORTED_TYPES = getTypes(); >> >+ >> >+ private static Set<MediaType> getTypes() { >> >+ HashSet<MediaType> supportedTypes = new HashSet<MediaType>(); >> >+ >> >+ supportedTypes.add(MediaType.image("png")); >> >+ supportedTypes.add(MediaType.image("jpeg")); >> >+ supportedTypes.add(MediaType.image("tiff")); >> >+ supportedTypes.add(MediaType.image("x-ms-bmp")); >> >+ supportedTypes.add(MediaType.image("gif")); >> >+ >> >+ return supportedTypes; >> >+ } >> >+ >> >+ @Override >> >+ public Set<MediaType> getSupportedTypes(ParseContext arg0) { >> >+ return 
SUPPORTED_TYPES; >> >+ } >> >+ >> >+ private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) { >> >+ if (!config.getTesseractPath().isEmpty()) { >> >+ Map<String, String> env = pb.environment(); >> >+ env.put("TESSDATA_PREFIX", config.getTesseractPath()); >> > } >> >- >> >- public void parse(Image image, ContentHandler handler, Metadata >> >metadata, ParseContext context) >> >- throws IOException, SAXException, TikaException { >> >- >> >- TemporaryResources tmp = new TemporaryResources(); >> >- FileOutputStream fos = null; >> >- TikaInputStream tis = null; >> >- try{ >> >- int w = image.getWidth(null); >> >- int h = image.getHeight(null); >> >- BufferedImage bImage = new BufferedImage(w, h, >> >BufferedImage.TYPE_INT_RGB); >> >- Graphics2D g2 = bImage.createGraphics(); >> >- g2.drawImage(image, 0, 0, null); >> >- g2.dispose(); >> >- File file = tmp.createTemporaryFile(); >> >- fos = new FileOutputStream(file); >> >- ImageIO.write(bImage, "png", fos); >> >- bImage = null; >> >- tis = TikaInputStream.get(file); >> >- parse(tis, handler, metadata, context); >> >- >> >- }finally{ >> >- tmp.dispose(); >> >- if(tis != null) >> >- tis.close(); >> >- if(fos != null) >> >- fos.close(); >> >- } >> >- >> >- >> >- } >> >- >> >- @Override >> >- public void parse( >> >- InputStream stream, ContentHandler handler, >> >- Metadata metadata, ParseContext context) >> >- throws IOException, SAXException, TikaException { >> >- >> >- TesseractOCRConfig config = >>context.get(TesseractOCRConfig.class); >> >- if(config == null) config = new TesseractOCRConfig(); >> >- >> >- String[] checkCmd = {config.getTesseractPath() + "tesseract"}; >> >- // If Tesseract is not on the path, do not try to run OCR. 
>> >- if (!ExternalParser.check(checkCmd)) return; >> >- >> >- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, >> >metadata); >> >+ } >> > >> >- TemporaryResources tmp = new TemporaryResources(); >> >- File output = null; >> >- try { >> >- TikaInputStream tikaStream = >>TikaInputStream.get(stream, >> tmp); >> >- File input = tikaStream.getFile(); >> >- long size = tikaStream.getLength(); >> >- >> >- if(size >= config.getMinFileSizeToOcr() && size <= >> >config.getMaxFileSizeToOcr()){ >> >- >> >- output = tmp.createTemporaryFile(); >> >- doOCR(input, output, config); >> >- >> >- //Tesseract appends .txt to output file name >> >- output = new File(output.getAbsolutePath() + ".txt"); >> >- >> >- if(output.exists()) >> >- extractOutput(new FileInputStream(output), >>xhtml); >> >+ public void parse(Image image, ContentHandler handler, Metadata >> >metadata, ParseContext context) throws IOException, >> >+ SAXException, TikaException { >> > >> >- } >> >- >> >- } finally { >> >- tmp.dispose(); >> >- if(output != null) >> >- output.delete(); >> >- >> >- } >> >+ TemporaryResources tmp = new TemporaryResources(); >> >+ FileOutputStream fos = null; >> >+ TikaInputStream tis = null; >> >+ try { >> >+ int w = image.getWidth(null); >> >+ int h = image.getHeight(null); >> >+ BufferedImage bImage = new BufferedImage(w, h, >> >BufferedImage.TYPE_INT_RGB); >> >+ Graphics2D g2 = bImage.createGraphics(); >> >+ g2.drawImage(image, 0, 0, null); >> >+ g2.dispose(); >> >+ File file = tmp.createTemporaryFile(); >> >+ fos = new FileOutputStream(file); >> >+ ImageIO.write(bImage, "png", fos); >> >+ bImage = null; >> >+ tis = TikaInputStream.get(file); >> >+ parse(tis, handler, metadata, context); >> >+ >> >+ } finally { >> >+ tmp.dispose(); >> >+ if (tis != null) >> >+ tis.close(); >> >+ if (fos != null) >> >+ fos.close(); >> > } >> > >> >- /** >> >- * Run external tesseract-ocr process. 
>> >- * @param input File to be ocred >> >- * @param output File to collect ocr result >> >- * @param config Configuration of tesseract-ocr engine >> >- * @throws TikaException if the extraction timed out >> >- * @throws IOException if an input error occurred >> >- */ >> >- private void doOCR(File input, File output, TesseractOCRConfig >> >config) >> >- throws IOException, TikaException { >> >- String[] cmd = {config.getTesseractPath() + "tesseract", >> >- input.getPath(), >> >- output.getPath() , >> >- "-l", >> >- config.getLanguage() , >> >- "-psm", >> >- config.getPageSegMode() >>}; >> >- >> >- ProcessBuilder pb = new ProcessBuilder(cmd); >> >- setEnv(config, pb); >> >- final Process process = pb.start(); >> >- >> >- process.getOutputStream().close(); >> >- InputStream out = process.getInputStream(); >> >- InputStream err = process.getErrorStream(); >> >- >> >- logStream("OCR MSG", out, input); >> >- logStream("OCR ERROR", err, input); >> >- >> >- FutureTask<Integer> waitTask = new FutureTask<Integer>(new >> >Callable<Integer>() { >> >- public Integer call() throws Exception { >> >- return process.waitFor(); >> >- } >> >- }); >> >- >> >- Thread waitThread = new Thread(waitTask); >> >- waitThread.start(); >> >- >> >- try { >> >- waitTask.get(config.getTimeout(), TimeUnit.SECONDS); >> >- >> >- } catch (InterruptedException e) { >> >- waitThread.interrupt(); >> >- process.destroy(); >> >- Thread.currentThread().interrupt(); >> >- throw new TikaException("TesseractOCRParser >>interrupted", >> e); >> >- >> >- } catch (ExecutionException e) { >> >- //should not be thrown >> >- >> >- } catch (TimeoutException e) { >> >- waitThread.interrupt(); >> >- process.destroy(); >> >- throw new TikaException("TesseractOCRParser >> timeout", e); >> >- } >> >- >> >- >> >+ } >> >+ >> >+ @Override >> >+ public void parse(InputStream stream, ContentHandler handler, >>Metadata >> >metadata, ParseContext context) >> >+ throws IOException, SAXException, TikaException { >> >+ >> >+ 
TesseractOCRConfig config = context.get(TesseractOCRConfig.class); >> >+ if (config == null) >> >+ config = new TesseractOCRConfig(); >> >+ >> >+ String[] checkCmd = { config.getTesseractPath() + "tesseract" }; >> >+ // If Tesseract is not on the path, do not try to run OCR. >> >+ if (!ExternalParser.check(checkCmd)) >> >+ return; >> >+ >> >+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, >> >metadata); >> >+ >> >+ TemporaryResources tmp = new TemporaryResources(); >> >+ File output = null; >> >+ try { >> >+ TikaInputStream tikaStream = TikaInputStream.get(stream, tmp); >> >+ File input = tikaStream.getFile(); >> >+ long size = tikaStream.getLength(); >> >+ >> >+ if (size >= config.getMinFileSizeToOcr() && size <= >> >config.getMaxFileSizeToOcr()) { >> >+ >> >+ output = tmp.createTemporaryFile(); >> >+ doOCR(input, output, config); >> >+ >> >+ // Tesseract appends .txt to output file name >> >+ output = new File(output.getAbsolutePath() + ".txt"); >> >+ >> >+ if (output.exists()) >> >+ extractOutput(new FileInputStream(output), xhtml); >> >+ >> >+ } >> >+ >> >+ } finally { >> >+ tmp.dispose(); >> >+ if (output != null) >> >+ output.delete(); >> >+ >> > } >> >- >> >+ } >> > >> >- /** >> >- * Reads the contents of the given stream and write it to the >> >- * given XHTML content handler. >> >- * The stream is closed once fully processed. 
>> >- * >> >- * @param stream Stream where is the result of ocr >> >- * @param xhtml XHTML content handler >> >- * @throws SAXException if the XHTML SAX events could not be >>handled >> >- * @throws IOException if an input error occurred >> >- */ >> >- private void extractOutput(InputStream stream, XHTMLContentHandler >> >xhtml) >> >- throws SAXException, IOException { >> >- >> >- Reader reader = new InputStreamReader(stream, "UTF-8"); >> >- xhtml.startDocument(); >> >- xhtml.startElement("div"); >> >- try { >> >- char[] buffer = new char[1024]; >> >- for (int n = reader.read(buffer); n != -1; n = >> >reader.read(buffer)) { >> >- if (n > 0) xhtml.characters(buffer, 0, n); >> >- } >> >- } finally { >> >- reader.close(); >> >- } >> >- xhtml.endElement("div"); >> >- xhtml.endDocument(); >> >+ /** >> >+ * Run external tesseract-ocr process. >> >+ * >> >+ * @param input >> >+ * File to be ocred >> >+ * @param output >> >+ * File to collect ocr result >> >+ * @param config >> >+ * Configuration of tesseract-ocr engine >> >+ * @throws TikaException >> >+ * if the extraction timed out >> >+ * @throws IOException >> >+ * if an input error occurred >> >+ */ >> >+ private void doOCR(File input, File output, TesseractOCRConfig >>config) >> >throws IOException, TikaException { >> >+ String[] cmd = { config.getTesseractPath() + "tesseract", >> >input.getPath(), output.getPath(), "-l", >> >+ config.getLanguage(), "-psm", config.getPageSegMode() }; >> >+ >> >+ ProcessBuilder pb = new ProcessBuilder(cmd); >> >+ setEnv(config, pb); >> >+ final Process process = pb.start(); >> >+ >> >+ process.getOutputStream().close(); >> >+ InputStream out = process.getInputStream(); >> >+ InputStream err = process.getErrorStream(); >> >+ >> >+ logStream("OCR MSG", out, input); >> >+ logStream("OCR ERROR", err, input); >> >+ >> >+ FutureTask<Integer> waitTask = new FutureTask<Integer>(new >> >Callable<Integer>() { >> >+ public Integer call() throws Exception { >> >+ return process.waitFor(); >> >+ 
} >> >+ }); >> >+ >> >+ Thread waitThread = new Thread(waitTask); >> >+ waitThread.start(); >> >+ >> >+ try { >> >+ waitTask.get(config.getTimeout(), TimeUnit.SECONDS); >> >+ >> >+ } catch (InterruptedException e) { >> >+ waitThread.interrupt(); >> >+ process.destroy(); >> >+ Thread.currentThread().interrupt(); >> >+ throw new TikaException("TesseractOCRParser interrupted", e); >> >+ >> >+ } catch (ExecutionException e) { >> >+ // should not be thrown >> >+ >> >+ } catch (TimeoutException e) { >> >+ waitThread.interrupt(); >> >+ process.destroy(); >> >+ throw new TikaException("TesseractOCRParser timeout", e); >> > } >> > >> >- /** >> >- * Starts a thread that reads the contents of the standard output >> >- * or error stream of the given process to not block the process. >> >- * The stream is closed once fully processed. >> >- */ >> >- private void logStream(final String logType, final InputStream >> >stream, final File file) { >> >- new Thread() { >> >- public void run() { >> >- Reader reader = new InputStreamReader(stream); >> >- StringBuilder out = new StringBuilder(); >> >- char[] buffer = new char[1024]; >> >- try { >> >- for (int n = >>reader.read(buffer); >> n != -1; n = reader.read(buffer)) >> >- out.append(buffer, 0, >>n); >> >- } catch (IOException e) { >> >- >> >- } finally { >> >- IOUtils.closeQuietly(stream); >> >- } >> >- >> >- >> >- String msg = out.toString(); >> >- //log or discard message? >> >- >> >- } >> >- }.start(); >> >+ } >> >+ >> >+ /** >> >+ * Reads the contents of the given stream and write it to the given >> >XHTML >> >+ * content handler. The stream is closed once fully processed. 
>> >+ * >> >+ * @param stream >> >+ * Stream where is the result of ocr >> >+ * @param xhtml >> >+ * XHTML content handler >> >+ * @throws SAXException >> >+ * if the XHTML SAX events could not be handled >> >+ * @throws IOException >> >+ * if an input error occurred >> >+ */ >> >+ private void extractOutput(InputStream stream, XHTMLContentHandler >> >xhtml) throws SAXException, IOException { >> >+ >> >+ Reader reader = new InputStreamReader(stream, "UTF-8"); >> >+ xhtml.startDocument(); >> >+ xhtml.startElement("div"); >> >+ try { >> >+ char[] buffer = new char[1024]; >> >+ for (int n = reader.read(buffer); n != -1; n = >> >reader.read(buffer)) { >> >+ if (n > 0) >> >+ xhtml.characters(buffer, 0, n); >> >+ } >> >+ } finally { >> >+ reader.close(); >> > } >> >+ xhtml.endElement("div"); >> >+ xhtml.endDocument(); >> >+ } >> >+ >> >+ /** >> >+ * Starts a thread that reads the contents of the standard output or >> >error >> >+ * stream of the given process to not block the process. The stream >>is >> >closed >> >+ * once fully processed. >> >+ */ >> >+ private void logStream(final String logType, final InputStream >>stream, >> >final File file) { >> >+ new Thread() { >> >+ public void run() { >> >+ Reader reader = new InputStreamReader(stream); >> >+ StringBuilder out = new StringBuilder(); >> >+ char[] buffer = new char[1024]; >> >+ try { >> >+ for (int n = reader.read(buffer); n != -1; n = >> >reader.read(buffer)) >> >+ out.append(buffer, 0, n); >> >+ } catch (IOException e) { >> > >> >- >> >-} >> >+ } finally { >> >+ IOUtils.closeQuietly(stream); >> >+ } >> > >> >+ String msg = out.toString(); >> >+ // log or discard message? 
>> > >> >+ } >> >+ }.start(); >> >+ } >> >+ >> >+ private List<Parser> getImageParsers() { >> >+ List<Parser> parsers = new ArrayList<Parser>(); >> >+ parsers.add(new ImageParser()); >> >+ parsers.add(new PSDParser()); >> >+ parsers.add(new TiffParser()); >> >+ parsers.add(new JpegParser()); >> >+ return parsers; >> >+ } >> >+ >> >+} >> > >> >Modified: >> >>>tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822 >>>Pa >> >rserTest.java >> >URL: >> > >> >>http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/ap >>a >> >>>che/tika/parser/mail/RFC822ParserTest.java?rev=1633325&r1=1633324&r2=163 >>>33 >> >25&view=diff >> >>>======================================================================== >>>== >> >==== >> >--- >> >>>tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822 >>>Pa >> >rserTest.java (original) >> >+++ >> >>>tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822 >>>Pa >> >rserTest.java Tue Oct 21 09:32:06 2014 >> >@@ -36,6 +36,8 @@ import org.apache.tika.metadata.Metadata >> > import org.apache.tika.metadata.TikaCoreProperties; >> > import org.apache.tika.parser.ParseContext; >> > import org.apache.tika.parser.Parser; >> >+import org.apache.tika.parser.ocr.TesseractOCRConfig; >> >+import org.apache.tika.parser.ocr.TesseractOCRParserTest; >> > import org.apache.tika.sax.BodyContentHandler; >> > import org.apache.tika.sax.XHTMLContentHandler; >> > import org.junit.Test; >> >@@ -83,13 +85,19 @@ public class RFC822ParserTest { >> > try { >> > parser.parse(stream, handler, metadata, new >>ParseContext()); >> > verify(handler).startDocument(); >> >- //4 body-part divs -- two outer bodies and two inner >>bodies >> >- verify(handler, >> >times(4)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), >> >eq("div"), any(Attributes.class)); >> >- verify(handler, >> >times(4)).endElement(XHTMLContentHandler.XHTML, "div", "div"); >> >- //5 paragraph elements, 4 for body-parts and 1 for >> 
>encompassing message >> >- verify(handler, >> >times(5)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), >> >any(Attributes.class)); >> >- verify(handler, >> >times(5)).endElement(XHTMLContentHandler.XHTML, "p", "p"); >> >+ int bodyExpectedTimes = 4, multipackExpectedTimes = 5;; >> >+ int invokingTimes = bodyExpectedTimes; >> >+ TesseractOCRConfig config = new TesseractOCRConfig(); >> >+ if (TesseractOCRParserTest.canRun(config)) { >> >+ invokingTimes = multipackExpectedTimes; >> >+ } >> >+ >> >+ verify(handler, >> >times(invokingTimes)).startElement(eq(XHTMLContentHandler.XHTML), >> >eq("div"), eq("div"), any(Attributes.class)); >> >+ verify(handler, >> >times(invokingTimes)).endElement(XHTMLContentHandler.XHTML, "div", >>"div"); >> >+ verify(handler, >> >>>times(multipackExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML >>>), >> > eq("p"), eq("p"), any(Attributes.class)); >> >+ verify(handler, >> >times(multipackExpectedTimes)).endElement(XHTMLContentHandler.XHTML, >>"p", >> >"p"); >> > verify(handler).endDocument(); >> >+ >> > } catch (Exception e) { >> > fail("Exception thrown: " + e.getMessage()); >> > } >> > >> > >> >> > > >-- >-------------- >Hong-Thai
