TIKA-2174 add jpx and jp2 to Tesseract
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/f2661f99 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/f2661f99 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/f2661f99 Branch: refs/heads/2.x Commit: f2661f997e69fcaf388561f122b306021928a5d4 Parents: 7422218 Author: tballison <[email protected]> Authored: Wed Nov 9 12:51:51 2016 -0500 Committer: tballison <[email protected]> Committed: Wed Nov 9 12:51:51 2016 -0500 ---------------------------------------------------------------------- .../main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java | 4 +++- .../java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/f2661f99/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index 2203a7f..a83d419 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -95,7 +95,9 @@ public class TesseractOCRParser extends AbstractParser { private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet( new HashSet<MediaType>(Arrays.asList(new MediaType[] { MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("tiff"), - MediaType.image("x-ms-bmp"), MediaType.image("gif") + MediaType.image("x-ms-bmp"), MediaType.image("gif"), + 
 MediaType.image("jp2"), + MediaType.image("jpx") }))); private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<String, Boolean>(); http://git-wip-us.apache.org/repos/asf/tika/blob/f2661f99/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java index 501364b..8d7e9a9 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java @@ -91,7 +91,7 @@ public class TesseractOCRParserTest extends TikaTest { // Assuming that Tesseract is on the path, we should find 5 Parsers that support PNG. assumeTrue(canRun()); - assertEquals(5, parser.getSupportedTypes(parseContext).size()); + assertEquals(7, parser.getSupportedTypes(parseContext).size()); assertTrue(parser.getSupportedTypes(parseContext).contains(png)); // DefaultParser will now select the TesseractOCRParser.
