Author: mattmann
Date: Sun Oct 12 16:30:37 2014
New Revision: 1631206

URL: http://svn.apache.org/r1631206
Log:
Fix for TIKA-1422 contributed by tpalsulich and mattmann.

Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1631206&r1=1631205&r2=1631206&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Sun Oct 12 16:30:37 2014
@@ -29,6 +29,8 @@ import java.io.Reader;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
+import java.util.List;
+import java.util.ArrayList;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.FutureTask;
@@ -43,6 +45,7 @@ import org.apache.tika.io.TemporaryResou
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.external.ExternalParser;
@@ -97,7 +100,7 @@ public class TesseractOCRParser extends 
        
        public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-               
+
                TemporaryResources tmp = new TemporaryResources();
                FileOutputStream fos = null;
                TikaInputStream tis = null;
@@ -131,6 +134,7 @@ public class TesseractOCRParser extends 
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
+
        TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
        if(config == null) config = new TesseractOCRConfig();
 
@@ -139,8 +143,7 @@ public class TesseractOCRParser extends 
         if (!ExternalParser.check(checkCmd)) return;
        
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-       xhtml.startDocument();
-       
+
         TemporaryResources tmp = new TemporaryResources();
         File output = null;
         try {
@@ -167,7 +170,6 @@ public class TesseractOCRParser extends 
                        output.delete();
             
         }
-        xhtml.endDocument();
     }
 
        /**
@@ -241,19 +243,21 @@ public class TesseractOCRParser extends 
      * @throws IOException if an input error occurred
      */
     private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
-            throws SAXException, IOException {
-       
+       throws SAXException, IOException {
+ 
         Reader reader = new InputStreamReader(stream, "UTF-8");
+        xhtml.startDocument();
+        xhtml.startElement("div");
         try {
-            xhtml.startElement("div");
             char[] buffer = new char[1024];
             for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
-                xhtml.characters(buffer, 0, n);
+                if (n > 0) xhtml.characters(buffer, 0, n);
             }
-            xhtml.endElement("div");
         } finally {
             reader.close();
         }
+        xhtml.endElement("div");
+        xhtml.endDocument();
     }
 
     /**

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java?rev=1631206&r1=1631205&r2=1631206&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java Sun Oct 12 16:30:37 2014
@@ -33,6 +33,12 @@ import static org.junit.Assume.assumeTru
 
 public class TesseractOCRTest  extends TikaTest {
 
+    public static boolean canRun() {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        TesseractOCRTest tesseractOCRTest = new TesseractOCRTest();
+        return tesseractOCRTest.canRun(config);
+    }
+
     private boolean canRun(TesseractOCRConfig config) {
         String[] checkCmd = {config.getTesseractPath() + "tesseract"};
         // If Tesseract is not on the path, do not run the test.


Reply via email to