Repository: tika Updated Branches: refs/heads/2.x 3f24e6c3e -> 9a68f4ccc
TIKA-2174 -- clean up Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/9a68f4cc Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/9a68f4cc Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/9a68f4cc Branch: refs/heads/2.x Commit: 9a68f4ccc12a633ab1ae7837d561480cc3e0c05c Parents: 3f24e6c Author: tballison <[email protected]> Authored: Thu Nov 10 09:27:40 2016 -0500 Committer: tballison <[email protected]> Committed: Thu Nov 10 09:27:40 2016 -0500 ---------------------------------------------------------------------- .../tika-parser-multimedia-module/pom.xml | 6 ++++++ .../tika/parser/ocr/TesseractOCRParser.java | 2 +- .../org/apache/tika/parser/pdf/PDF2XHTML.java | 21 ++++++++++++++++++-- .../tika/parser/ocr/TesseractOCRParserTest.java | 2 +- 4 files changed, 27 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/9a68f4cc/tika-parser-modules/tika-parser-multimedia-module/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/pom.xml b/tika-parser-modules/tika-parser-multimedia-module/pom.xml index 1f4146c..e0ffec8 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/pom.xml +++ b/tika-parser-modules/tika-parser-multimedia-module/pom.xml @@ -146,6 +146,12 @@ <artifactId>jai-imageio-core</artifactId> <scope>test</scope> </dependency> + <dependency> + <groupId>com.github.jai-imageio</groupId> + <artifactId>jai-imageio-jpeg2000</artifactId> + <version>1.3.0</version> + <scope>test</scope> + </dependency> </dependencies> <build> http://git-wip-us.apache.org/repos/asf/tika/blob/9a68f4cc/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index 031e8b9..a63eae1 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -96,7 +96,7 @@ public class TesseractOCRParser extends AbstractParser { new HashSet<MediaType>(Arrays.asList(new MediaType[] { MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("tiff"), MediaType.image("x-ms-bmp"), MediaType.image("gif"), - MediaType.APPLICATION_XML.image("jp2"), + MediaType.image("jp2"), MediaType.image("jpx"), MediaType.image("x-portable-pixmap") }))); private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<String, Boolean>(); http://git-wip-us.apache.org/repos/asf/tika/blob/9a68f4cc/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java index 0ae8137..b416a61 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java @@ -34,6 +34,7 @@ import org.apache.commons.io.IOExceptionWithCause; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSStream; +import org.apache.pdfbox.filter.MissingImageReaderException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDResources; @@ -67,6 +68,9 @@ class PDF2XHTML extends AbstractPDF2XHTML { COSName.DCT_DECODE.getName(), COSName.DCT_DECODE_ABBREVIATION.getName()); + private static final List<String> JP2 = + Arrays.asList(COSName.JPX_DECODE.getName()); + /** * This keeps track of the pdf object ids for inline * images that have been processed. @@ -170,12 +174,19 @@ class PDF2XHTML extends AbstractPDF2XHTML { } for (COSName name : resources.getXObjectNames()) { + PDXObject object = null; + try { + object = resources.getXObject(name); + } catch (MissingImageReaderException e) { + EmbeddedDocumentUtil.recordException(e, metadata); + continue; + } - PDXObject object = resources.getXObject(name); if (object == null) { continue; } COSStream cosStream = object.getCOSObject(); + if (seenThisPage.contains(cosStream)) { //avoid infinite recursion TIKA-1742 continue; @@ -198,6 +209,8 @@ class PDF2XHTML extends AbstractPDF2XHTML { } else if (extension.equals("tiff")) { embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/tiff"); extension = "tif"; + } else if (extension.equals("jpx")) { + embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jp2"); } else { //TODO: determine if we need to add more image types //throw new RuntimeException("EXTEN:" + extension); @@ -237,7 +250,7 @@ class PDF2XHTML extends AbstractPDF2XHTML { writeToBuffer(image, extension, buffer); } catch (IOException e) { EmbeddedDocumentUtil.recordException(e, metadata); - return; + continue; } embeddedDocumentExtractor.parseEmbedded( new ByteArrayInputStream(buffer.toByteArray()), @@ -271,6 +284,10 @@ class PDF2XHTML extends AbstractPDF2XHTML { // for CMYK and other "unusual" colorspaces, the JPEG will be converted ImageIOUtil.writeImage(image, suffix, out); } + } else if ("jp2".equals(suffix) || "jpx".equals(suffix)) { + InputStream data = pdImage.createInputStream(JP2); + org.apache.pdfbox.io.IOUtils.copy(data, out); + org.apache.pdfbox.io.IOUtils.closeQuietly(data); } else { ImageIOUtil.writeImage(image, suffix, out); } http://git-wip-us.apache.org/repos/asf/tika/blob/9a68f4cc/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java index ce5531d..c0befa1 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java @@ -91,7 +91,7 @@ public class TesseractOCRParserTest extends TikaTest { // Assuming that Tesseract is on the path, we should find 5 Parsers that support PNG. assumeTrue(canRun()); - assertEquals(7, parser.getSupportedTypes(parseContext).size()); + assertEquals(8, parser.getSupportedTypes(parseContext).size()); assertTrue(parser.getSupportedTypes(parseContext).contains(png)); // DefaultParser will now select the TesseractOCRParser.
