Author: nick
Date: Fri Dec 19 05:28:07 2014
New Revision: 1646624

URL: http://svn.apache.org/r1646624
Log:
Temporary workaround for TIKA-1445 for Tika 1.7 - always pass the image on to the 
regular image parser as well, so that its metadata gets set. Will be replaced in 1.8 
with composite parsers + a user-selected parser strategy configuration

Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1646624&r1=1646623&r2=1646624&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
 Fri Dec 19 05:28:07 2014
@@ -26,9 +26,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
-import java.util.ArrayList;
 import java.util.HashSet;
-import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.Callable;
@@ -50,7 +48,6 @@ import org.apache.tika.parser.ParseConte
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.external.ExternalParser;
 import org.apache.tika.parser.image.ImageParser;
-import org.apache.tika.parser.image.PSDParser;
 import org.apache.tika.parser.image.TiffParser;
 import org.apache.tika.parser.jpeg.JpegParser;
 import org.apache.tika.sax.XHTMLContentHandler;
@@ -166,13 +163,31 @@ public class TesseractOCRParser extends
 
       }
 
+      // Temporary workaround for TIKA-1445 - until we can specify
+      //  composite parsers with strategies (eg Composite, Try In Turn),
+      //  always send the image onwards to the regular parser to have
+      //  the metadata for them extracted as well
+      String type = metadata.get(Metadata.CONTENT_TYPE);
+      if (_TMP_IMG_PARSER.getSupportedTypes(context).contains(type)) {
+          _TMP_IMG_PARSER.parse(tikaStream, handler, metadata, context);
+      }
+      if (_TMP_JPEG_PARSER.getSupportedTypes(context).contains(type)) {
+          _TMP_JPEG_PARSER.parse(tikaStream, handler, metadata, context);
+      }
+      if (_TMP_TIFF_PARSER.getSupportedTypes(context).contains(type)) {
+          _TMP_TIFF_PARSER.parse(tikaStream, handler, metadata, context);
+      }
     } finally {
       tmp.dispose();
-      if (output != null)
+      if (output != null) {
         output.delete();
-
+      }
     }
   }
+  // TIKA-1445 workaround parsers
+  private static Parser _TMP_IMG_PARSER = new ImageParser();
+  private static Parser _TMP_JPEG_PARSER = new JpegParser();
+  private static Parser _TMP_TIFF_PARSER = new TiffParser();
 
   /**
    * Run external tesseract-ocr process.


Reply via email to