Repository: tika Updated Branches: refs/heads/master 47ba703d6 -> 91cdce43d
TIKA-2175 -- add extraction for inline jp2/jpx from PDFParser Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/91cdce43 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/91cdce43 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/91cdce43 Branch: refs/heads/master Commit: 91cdce43d22cd6726375a83c7842fa299035a258 Parents: 47ba703 Author: tballison <[email protected]> Authored: Wed Nov 9 23:01:13 2016 -0500 Committer: tballison <[email protected]> Committed: Wed Nov 9 23:01:13 2016 -0500 ---------------------------------------------------------------------- tika-parsers/pom.xml | 6 +++++ .../org/apache/tika/parser/pdf/PDF2XHTML.java | 24 ++++++++++++++++---- .../apache/tika/parser/pdf/PDFParserTest.java | 2 ++ 3 files changed, 28 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/91cdce43/tika-parsers/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml index 31a727d..b7f4d38 100644 --- a/tika-parsers/pom.xml +++ b/tika-parsers/pom.xml @@ -334,6 +334,12 @@ <version>1.3.1</version> <scope>test</scope> </dependency> + <dependency> + <groupId>com.github.jai-imageio</groupId> + <artifactId>jai-imageio-jpeg2000</artifactId> + <version>1.3.0</version> + <scope>test</scope> + </dependency> <!-- edu.ucar dependencies --> <dependency> <groupId>edu.ucar</groupId> http://git-wip-us.apache.org/repos/asf/tika/blob/91cdce43/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java index 2a81103..d89dce4 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java @@ -34,6 +34,7 @@ import org.apache.commons.io.IOExceptionWithCause; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSStream; +import org.apache.pdfbox.filter.MissingImageReaderException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDResources; @@ -67,6 +68,9 @@ class PDF2XHTML extends AbstractPDF2XHTML { COSName.DCT_DECODE.getName(), COSName.DCT_DECODE_ABBREVIATION.getName()); + private static final List<String> JP2 = + Arrays.asList(COSName.JPX_DECODE.getName()); + /** * This keeps track of the pdf object ids for inline * images that have been processed. @@ -168,7 +172,14 @@ class PDF2XHTML extends AbstractPDF2XHTML { for (COSName name : resources.getXObjectNames()) { - PDXObject object = resources.getXObject(name); + PDXObject object = null; + try { + object = resources.getXObject(name); + } catch (MissingImageReaderException e) { + EmbeddedDocumentUtil.recordException(e, metadata); + continue; + } + if (object == null) { continue; } @@ -195,11 +206,12 @@ class PDF2XHTML extends AbstractPDF2XHTML { } else if (extension.equals("tiff")) { embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/tiff"); extension = "tif"; + } else if (extension.equals("jpx")) { + embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jp2"); } else { //TODO: determine if we need to add more image types - //throw new RuntimeException("EXTEN:" + extension); +// throw new RuntimeException("EXTEN:" + extension); } - Integer imageNumber = processedInlineImages.get(cosStream); if (imageNumber == null) { imageNumber = inlineImageCounter++; @@ -268,7 +280,11 @@ class PDF2XHTML extends AbstractPDF2XHTML { // for CMYK and other "unusual" colorspaces, the JPEG will be converted ImageIOUtil.writeImage(image, suffix, out); } - } else { + } else if ("jp2".equals(suffix) || "jpx".equals(suffix)) { + InputStream data = pdImage.createInputStream(JP2); + org.apache.pdfbox.io.IOUtils.copy(data, out); + org.apache.pdfbox.io.IOUtils.closeQuietly(data); + } else{ ImageIOUtil.writeImage(image, suffix, out); } } http://git-wip-us.apache.org/repos/asf/tika/blob/91cdce43/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 1f0f4d6..f29f544 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -1292,6 +1292,8 @@ public class PDFParserTest extends TikaTest { } } + //TODO: figure out how to test jp2 embedded with OCR + private void assertException(String path, Parser parser, ParseContext context, Class expected) { boolean noEx = false; InputStream is = getResourceAsStream(path);
