Author: tallison
Date: Mon Feb 3 20:11:10 2014
New Revision: 1564042

URL: http://svn.apache.org/r1564042
Log:
TIKA-1228: Look for attachments under Kids node if embeddedFiles.getNames() returns null

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_childAttachments.pdf Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1564042&r1=1564041&r2=1564042&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Mon Feb 3 20:11:10 2014 @@ -32,6 +32,7 @@ import org.apache.pdfbox.pdmodel.PDDocum import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.COSObjectable; +import org.apache.pdfbox.pdmodel.common.PDNameTreeNode; import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification; import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile; import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction; @@ -346,48 +347,73 @@ class PDF2XHTML extends PDFTextStripper } private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler) - throws IOException, SAXException, TikaException { - PDDocumentCatalog catalog = document.getDocumentCatalog(); - PDDocumentNameDictionary names = catalog.getNames(); - if (names != null) { - - PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles(); - if (embeddedFiles != null) { - - EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class); - if (embeddedExtractor == null) { - embeddedExtractor = new 
ParsingEmbeddedDocumentExtractor(context); - } - - Map<String, COSObjectable> embeddedFileNames = embeddedFiles.getNames(); - - if (embeddedFileNames != null) { - for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet()) { - PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue(); - PDEmbeddedFile file = spec.getEmbeddedFile(); - - Metadata metadata = new Metadata(); - // TODO: other metadata? - metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey()); - metadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); - metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); - - if (embeddedExtractor.shouldParseEmbedded(metadata)) { - TikaInputStream stream = TikaInputStream.get(file.createInputStream()); - try { - embeddedExtractor.parseEmbedded( - stream, - new EmbeddedContentHandler(handler), - metadata, false); - } finally { - stream.close(); - } - } - } - } - } - } - } + throws IOException, SAXException, TikaException { + PDDocumentCatalog catalog = document.getDocumentCatalog(); + PDDocumentNameDictionary names = catalog.getNames(); + if (names == null){ + return; + } + PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles(); + + if (embeddedFiles == null) { + return; + } + + EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class); + if (embeddedExtractor == null) { + embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); + } + + Map<String, COSObjectable> embeddedFileNames = embeddedFiles.getNames(); + //For now, try to get the embeddedFileNames out of embeddedFiles or its kids. + //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java + //If there is a need we could add a fully recursive search to find a non-null + //Map<String, COSObjectable> that contains the doc info. 
+ if (embeddedFileNames != null){ + processEmbeddedDocNames(embeddedFileNames, embeddedExtractor); + } else { + List<PDNameTreeNode> kids = embeddedFiles.getKids(); + if (kids == null){ + return; + } + for (PDNameTreeNode n : kids){ + Map<String, COSObjectable> childNames = n.getNames(); + if (childNames != null){ + processEmbeddedDocNames(childNames, embeddedExtractor); + } + } + } + } + + + private void processEmbeddedDocNames(Map<String, COSObjectable> embeddedFileNames, + EmbeddedDocumentExtractor embeddedExtractor) throws IOException, SAXException, TikaException { + if (embeddedFileNames == null){ + return; + } + for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet()) { + PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue(); + PDEmbeddedFile file = spec.getEmbeddedFile(); + + Metadata metadata = new Metadata(); + // TODO: other metadata? + metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey()); + metadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); + metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); + + if (embeddedExtractor.shouldParseEmbedded(metadata)) { + TikaInputStream stream = TikaInputStream.get(file.createInputStream()); + try { + embeddedExtractor.parseEmbedded( + stream, + new EmbeddedContentHandler(handler), + metadata, false); + } finally { + stream.close(); + } + } + } + } private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) throws IOException, SAXException { //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java?rev=1564042&r1=1564041&r2=1564042&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java (original) +++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java Mon Feb 3 20:11:10 2014 @@ -107,7 +107,7 @@ public abstract class TikaTest { * Tries to close input stream after processing. */ public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception{ - ContentHandler handler = new BodyContentHandler(); + ContentHandler handler = new BodyContentHandler(1000000); try { parser.parse(is, handler, metadata, context); } finally { Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1564042&r1=1564041&r2=1564042&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Mon Feb 3 20:11:10 2014 @@ -48,10 +48,11 @@ import org.xml.sax.ContentHandler; */ public class PDFParserTest extends TikaTest { + public static final MediaType TYPE_TEXT = MediaType.TEXT_PLAIN; public static final MediaType TYPE_EMF = MediaType.application("x-emf"); public static final MediaType TYPE_PDF = MediaType.application("pdf"); public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"); - + public static final MediaType TYPE_DOC = MediaType.application("msword"); @Test public void testPdfParsing() throws Exception { @@ -564,7 +565,7 @@ public class PDFParserTest extends TikaT //make sure nothing went wrong with getting the resource to test-documents //This will require modification with each new pdf test. //If this is too annoying, we can turn it off. 
- assertEquals("Number of pdf files tested", 15, pdfs); + assertEquals("Number of pdf files tested", 16, pdfs); } @@ -625,4 +626,30 @@ public class PDFParserTest extends TikaT String xml = getXML("/testPDF_acroform3.pdf").xml; assertTrue("found", (xml.indexOf("<li>aTextField: TIKA-1226</li>") > -1)); } + + //TIKA-1228 + public void testEmbeddedFilesInChildren() throws Exception { + String xml = getXML("/testPDF_childAttachments.pdf").xml; + //"regressiveness" exists only in Unit10.doc not in the container pdf document + assertTrue(xml.contains("regressiveness")); + + TrackingHandler tracker = new TrackingHandler(); + TikaInputStream tis = null; + ContainerExtractor ex = new ParserContainerExtractor(); + try{ + tis= TikaInputStream.get( + getResourceAsStream("/test-documents/testPDF_childAttachments.pdf")); + ex.extract(tis, ex, tracker); + } finally { + if (tis != null){ + tis.close(); + } + } + assertEquals(2, tracker.filenames.size()); + assertEquals(2, tracker.mediaTypes.size()); + assertEquals("Press Quality(1).joboptions", tracker.filenames.get(0)); + assertEquals("Unit10.doc", tracker.filenames.get(1)); + assertEquals(TYPE_TEXT, tracker.mediaTypes.get(0)); + assertEquals(TYPE_DOC, tracker.mediaTypes.get(1)); + } } Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_childAttachments.pdf URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_childAttachments.pdf?rev=1564042&view=auto ============================================================================== Files tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_childAttachments.pdf (added) and tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_childAttachments.pdf Mon Feb 3 20:11:10 2014 differ