Author: nick
Date: Wed Jan 7 12:03:38 2015
New Revision: 1650045
URL: http://svn.apache.org/r1650045
Log:
Cleaner workaround parser call from Tim Allison (TIKA-1445)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1650045&r1=1650044&r2=1650045&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Wed Jan 7 12:03:38 2015
@@ -26,8 +26,10 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
+import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
+import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
@@ -44,7 +46,9 @@ import org.apache.tika.io.TemporaryResou
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.external.ExternalParser;
@@ -177,16 +181,7 @@ public class TesseractOCRParser extends
// composite parsers with strategies (eg Composite, Try In Turn),
// always send the image onwards to the regular parser to have
// the metadata for them extracted as well
- String type = metadata.get(Metadata.CONTENT_TYPE);
- if (_TMP_IMG_PARSER.getSupportedTypes(context).contains(type)) {
- _TMP_IMG_PARSER.parse(tikaStream, handler, metadata, context);
- }
- if (_TMP_JPEG_PARSER.getSupportedTypes(context).contains(type)) {
- _TMP_JPEG_PARSER.parse(tikaStream, handler, metadata, context);
- }
- if (_TMP_TIFF_PARSER.getSupportedTypes(context).contains(type)) {
- _TMP_TIFF_PARSER.parse(tikaStream, handler, metadata, context);
- }
+ _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
} finally {
tmp.dispose();
if (output != null) {
@@ -194,10 +189,16 @@ public class TesseractOCRParser extends
}
}
}
- // TIKA-1445 workaround parsers
- private static Parser _TMP_IMG_PARSER = new ImageParser();
- private static Parser _TMP_JPEG_PARSER = new JpegParser();
- private static Parser _TMP_TIFF_PARSER = new TiffParser();
+ // TIKA-1445 workaround parser
+ private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser();
+ private static class CompositeImageParser extends CompositeParser {
+ private static List<Parser> imageParsers = Arrays.asList(new Parser[]{
+ new ImageParser(), new JpegParser(), new TiffParser()
+ });
+ CompositeImageParser() {
+ super(MediaTypeRegistry.getDefaultRegistry(), imageParsers);
+ }
+ }
/**
* Run external tesseract-ocr process.