Author: nick
Date: Fri Dec 19 05:28:07 2014
New Revision: 1646624
URL: http://svn.apache.org/r1646624
Log:
Temporary workaround for TIKA-1445 for Tika 1.7 - always pass the image on to
the regular image parser as well, so that its metadata gets set. Will be replaced
in 1.8 by composite parsers with a user-selectable strategy configuration
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1646624&r1=1646623&r2=1646624&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
Fri Dec 19 05:28:07 2014
@@ -26,9 +26,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
-import java.util.ArrayList;
import java.util.HashSet;
-import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
@@ -50,7 +48,6 @@ import org.apache.tika.parser.ParseConte
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.image.ImageParser;
-import org.apache.tika.parser.image.PSDParser;
import org.apache.tika.parser.image.TiffParser;
import org.apache.tika.parser.jpeg.JpegParser;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -166,13 +163,31 @@ public class TesseractOCRParser extends
}
+ // Temporary workaround for TIKA-1445 - until we can specify
+ // composite parsers with strategies (eg Composite, Try In Turn),
+ // always send the image onwards to the regular parser to have
+ // the metadata for them extracted as well
+ String type = metadata.get(Metadata.CONTENT_TYPE);
+ if (_TMP_IMG_PARSER.getSupportedTypes(context).contains(type)) {
+ _TMP_IMG_PARSER.parse(tikaStream, handler, metadata, context);
+ }
+ if (_TMP_JPEG_PARSER.getSupportedTypes(context).contains(type)) {
+ _TMP_JPEG_PARSER.parse(tikaStream, handler, metadata, context);
+ }
+ if (_TMP_TIFF_PARSER.getSupportedTypes(context).contains(type)) {
+ _TMP_TIFF_PARSER.parse(tikaStream, handler, metadata, context);
+ }
} finally {
tmp.dispose();
- if (output != null)
+ if (output != null) {
output.delete();
-
+ }
}
}
+ // TIKA-1445 workaround parsers
+ private static Parser _TMP_IMG_PARSER = new ImageParser();
+ private static Parser _TMP_JPEG_PARSER = new JpegParser();
+ private static Parser _TMP_TIFF_PARSER = new TiffParser();
/**
* Run external tesseract-ocr process.