Repository: tika Updated Branches: refs/heads/master ed762b702 -> dbefe9830
TIKA-1857: add basic XFA extraction support via Pascal Essiembre. This closes #74 Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/dbefe983 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/dbefe983 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/dbefe983 Branch: refs/heads/master Commit: dbefe9830b26d05f9ce53503565a069bcc63d7c1 Parents: ed762b7 Author: tballison <[email protected]> Authored: Tue Mar 1 20:58:57 2016 -0500 Committer: tballison <[email protected]> Committed: Tue Mar 1 20:58:57 2016 -0500 ---------------------------------------------------------------------- .../org/apache/tika/parser/pdf/PDF2XHTML.java | 20 ++ .../org/apache/tika/parser/pdf/PDFParser.java | 35 +- .../apache/tika/parser/pdf/PDFParserConfig.java | 36 ++- .../apache/tika/parser/pdf/XFAExtractor.java | 318 +++++++++++++++++++ .../apache/tika/parser/pdf/PDFParser.properties | 3 +- .../apache/tika/parser/pdf/PDFParserTest.java | 32 +- .../testPDF_XFA_govdocs1_258578.pdf | Bin 0 -> 168176 bytes 7 files changed, 440 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/dbefe983/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java index 1ffe60c..d656d5a 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java @@ -16,6 +16,8 @@ */ package org.apache.tika.parser.pdf; +import javax.xml.stream.XMLStreamException; +import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -63,6 +65,7 @@ import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlin import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; import org.apache.pdfbox.pdmodel.interactive.form.PDField; import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField; +import org.apache.pdfbox.pdmodel.interactive.form.PDXFA; import org.apache.pdfbox.util.PDFTextStripper; import org.apache.pdfbox.util.TextPosition; import org.apache.tika.exception.TikaException; @@ -99,6 +102,7 @@ class PDF2XHTML extends PDFTextStripper { private final ParseContext context; private final XHTMLContentHandler handler; private final PDFParserConfig config; + private final Metadata metadata; /** * This keeps track of the pdf object ids for inline * images that have been processed. @@ -121,6 +125,7 @@ class PDF2XHTML extends PDFTextStripper { this.originalHandler = handler; this.context = context; this.handler = new XHTMLContentHandler(handler, metadata); + this.metadata = metadata; } /** @@ -581,6 +586,21 @@ class PDF2XHTML extends PDFTextStripper { if (form == null) return; + //if it has xfa, try that. + //if it doesn't exist or there's an exception, + //go with traditional AcroForm + PDXFA pdxfa = form.getXFA(); + if (pdxfa != null) { + XFAExtractor xfaExtractor = new XFAExtractor(); + try { + xfaExtractor.extract(new BufferedInputStream( + new ByteArrayInputStream(pdxfa.getBytes())), handler, metadata); + return; + } catch (XMLStreamException |IOException e) { + //if there was an xml parse exception in xfa, try the AcroForm + } + } + @SuppressWarnings("rawtypes") List fields = form.getFields(); http://git-wip-us.apache.org/repos/asf/tika/blob/dbefe983/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 01bbc8a..29ebddf 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -16,6 +16,8 @@ */ package org.apache.tika.parser.pdf; +import javax.xml.stream.XMLStreamException; +import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; @@ -56,6 +58,7 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -145,7 +148,11 @@ public class PDFParser extends AbstractParser { AccessChecker checker = localConfig.getAccessChecker(); checker.check(metadata); if (handler != null) { - PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig); + if (shouldHandleXFAOnly(pdfDocument, localConfig)) { + handleXFAOnly(pdfDocument, handler, metadata); + } else { + PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig); + } } } catch (CryptographyException e) { @@ -495,6 +502,32 @@ public class PDFParser extends AbstractParser { } } + + private boolean shouldHandleXFAOnly(PDDocument pdDocument, PDFParserConfig config) { + if (config.getIfXFAExtractOnlyXFA() && + pdDocument.getDocumentCatalog() != null && + pdDocument.getDocumentCatalog().getAcroForm() != null && + pdDocument.getDocumentCatalog().getAcroForm().getXFA() != null) { + return true; + } + return false; + } + + private void handleXFAOnly(PDDocument pdDocument, ContentHandler handler, Metadata metadata) + throws SAXException, IOException, TikaException { + XFAExtractor ex = new XFAExtractor(); + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + try { + ex.extract(new ByteArrayInputStream( + pdDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes()), + xhtml, metadata); + } catch (XMLStreamException e) { + throw new TikaException("XML error in XFA", e); + } + xhtml.endDocument(); + } + public PDFParserConfig getPDFParserConfig() { return defaultConfig; } http://git-wip-us.apache.org/repos/asf/tika/blob/dbefe983/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index 74e67dd..2a650dd 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -79,6 +79,10 @@ public class PDFParserConfig implements Serializable { //The space width-based tolerance value used to estimate where spaces in text should be added private Float spacingTolerance; + //If the PDF has an XFA element, process only that and skip extracting + //content from elsewhere in the document. + private boolean ifXFAExtractOnlyXFA = false; + private AccessChecker accessChecker; public PDFParserConfig() { @@ -139,6 +143,10 @@ public class PDFParserConfig implements Serializable { getProp(props.getProperty("extractUniqueInlineImagesOnly"), getExtractUniqueInlineImagesOnly())); + setIfXFAExtractOnlyXFA( + getProp(props.getProperty("ifXFAExtractOnlyXFA"), + getIfXFAExtractOnlyXFA())); + boolean checkExtractAccessPermission = getProp(props.getProperty("checkExtractAccessPermission"), false); boolean allowExtractionForAccessibility = getProp(props.getProperty("allowExtractionForAccessibility"), true); @@ -182,7 +190,8 @@ public class PDFParserConfig implements Serializable { /** * If true (the default), extract content from AcroForms - * at the end of the document. + * at the end of the document. If an XFA is found, + * try to process that, otherwise, process the AcroForm. * * @param extractAcroFormContent */ @@ -192,6 +201,26 @@ public class PDFParserConfig implements Serializable { } /** + * @see #setIfXFAExtractOnlyXFA(boolean) + * @return how to handle XFA data if it exists + */ + public boolean getIfXFAExtractOnlyXFA() { + return ifXFAExtractOnlyXFA; + } + + /** + * If false (the default), extract content from the full PDF + * as well as the XFA form. This will likely lead to some duplicative + * content. + * + * @param ifXFAExtractOnlyXFA + */ + public void setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA) { + this.ifXFAExtractOnlyXFA = ifXFAExtractOnlyXFA; + } + + + /** * @see #setExtractInlineImages(boolean) */ public boolean getExtractInlineImages() { @@ -411,6 +440,7 @@ public class PDFParserConfig implements Serializable { result = prime * result + (suppressDuplicateOverlappingText ? 1231 : 1237); result = prime * result + (useNonSequentialParser ? 1231 : 1237); + result = prime * result + (ifXFAExtractOnlyXFA ? 1231 : 1237); return result; } @@ -449,6 +479,9 @@ public class PDFParserConfig implements Serializable { return false; if (useNonSequentialParser != other.useNonSequentialParser) return false; + if (ifXFAExtractOnlyXFA != other.ifXFAExtractOnlyXFA) + return false; + return true; } @@ -460,6 +493,7 @@ public class PDFParserConfig implements Serializable { + extractAnnotationText + ", sortByPosition=" + sortByPosition + ", useNonSequentialParser=" + useNonSequentialParser + ", extractAcroFormContent=" + extractAcroFormContent + + ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA + ", extractInlineImages=" + extractInlineImages + ", extractUniqueInlineImagesOnly=" + extractUniqueInlineImagesOnly + ", averageCharTolerance=" http://git-wip-us.apache.org/repos/asf/tika/blob/dbefe983/tika-parsers/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java new file mode 100644 index 0000000..3c2b496 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java @@ -0,0 +1,318 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pdf; + +import java.io.InputStream; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import javax.xml.namespace.QName; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLResolver; +import javax.xml.stream.XMLStreamConstants; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.XMLStreamReader; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +/** + * This class offers an initial capability to + * scrape text containing elements out of XFA, and + * it tries to link fields with values. + * <p> + * Some areas for improvement: + * <ol> + * <li>convert this to 2 lines of XPath</li> + * <li>handle metadata stored in <desc> section (govdocs1: 754282.pdf, 982106.pdf)</li> + * <li>handle pdf metadata (access permissions, etc.) in <pdf> element</li> + * <li>extract different types of uris as metadata</li> + * <li>add extraction of <image> data (govdocs1: 754282.pdf)</li> + * <li>add computation of traversal order for fields</li> + * <li>figure out when text extracted from xfa fields is duplicative of that + * extracted from the rest of the pdf...and do this efficiently and quickly</li> + * <li>avoid duplication with <speak> and <tooltip> elements</li> + * </ol> + */ +class XFAExtractor { + + private static final Pattern XFA_TEMPLATE_ANY_VERSION = Pattern.compile("^http://www.xfa.org/schema/xfa-template"); + private static final Pattern TEXT_PATTERN = + Pattern.compile("^(speak|text|contents-richtext|toolTip|exData)$"); + + private static final String XFA_DATA_NS = "http://www.xfa.org/schema/xfa-data/1.0/"; + + private static final String FIELD_LN = "field"; + private static final QName XFA_DATA = new QName(XFA_DATA_NS, "data"); + + private static final XMLInputFactory factory; + + static { + factory = XMLInputFactory.newFactory(); + factory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, true); + factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false); + factory.setProperty(XMLInputFactory.IS_VALIDATING, false); + factory.setXMLResolver(new XMLResolver() { + @Override + public Object resolveEntity(String publicID, String systemID, String baseURI, String namespace) throws XMLStreamException { + return null; + } + }); + } + private final Matcher xfaTemplateMatcher;//namespace any version + private final Matcher textMatcher; + + XFAExtractor() { + xfaTemplateMatcher = XFA_TEMPLATE_ANY_VERSION.matcher(""); + textMatcher = TEXT_PATTERN.matcher(""); + } + + void extract(InputStream xfaIs, XHTMLContentHandler xhtml, Metadata m) + throws XMLStreamException, SAXException { + xhtml.startElement("div", "class", "xfa_content"); + + Map<String, String> pdfObjRToValues = new HashMap<>(); + + //for now, store and dump the fields in insertion order + Map<String, XFAField> namedFields = new LinkedHashMap<>(); + + //The strategy is to cache the fields in fields + //and cache the values in pdfObjRToValues while + //handling the text etc along the way. + // + //As a final step, dump the merged fields and the values. + + XMLStreamReader reader = factory.createXMLStreamReader(xfaIs); + while (reader.hasNext()) { + switch (reader.next()) { + case XMLStreamConstants.START_ELEMENT : + QName name = reader.getName(); + String localName = name.getLocalPart(); + if (xfaTemplateMatcher.reset(name.getNamespaceURI()).find() && + FIELD_LN.equals(name.getLocalPart())) { + handleField(reader, namedFields); + } else if (XFA_DATA.equals(name)) {//full qname match is important! + loadData(reader, pdfObjRToValues); + } else if (textMatcher.reset(localName).find()) { + scrapeTextUntil(reader, xhtml, name); + } + break; + case XMLStreamConstants.END_ELEMENT : + break; + } + } + + if (namedFields.size() == 0) { + xhtml.endElement("xfa_content"); + return; + } + //now dump fields and values + xhtml.startElement("div", "class", "xfa_form"); + xhtml.startElement("ol"); + StringBuilder sb = new StringBuilder(); + for (Map.Entry<String, XFAField> e : namedFields.entrySet()) { + String fieldName = e.getKey(); + XFAField field = e.getValue(); + String fieldValue = pdfObjRToValues.get(fieldName); + AttributesImpl attrs = new AttributesImpl(); + attrs.addAttribute("", "fieldName", "fieldName", "CDATA", fieldName); + + String displayFieldName = (field.toolTip == null || + field.toolTip.trim().length() == 0) ? fieldName : field.toolTip; + + sb.append(displayFieldName).append(": "); + if (fieldValue != null) { + sb.append(fieldValue); + } + + xhtml.startElement("li", attrs); + xhtml.characters(sb.toString()); + xhtml.endElement("li"); + sb.setLength(0); + } + xhtml.endElement("ol"); + xhtml.endElement("div"); + xhtml.endElement("xfa_content"); + } + + //try to scrape the text until the endElement + private void scrapeTextUntil(XMLStreamReader reader, XHTMLContentHandler xhtml, + QName endElement) throws XMLStreamException, SAXException { + StringBuilder buffer = new StringBuilder(); + boolean keepGoing = true; + while (reader.hasNext() && keepGoing) { + switch (reader.next()) { + case XMLStreamConstants.START_ELEMENT: + break; + case XMLStreamConstants.CHARACTERS: + int start = reader.getTextStart(); + int length = reader.getTextLength(); + buffer.append(reader.getTextCharacters(), + start, + length); + break; + + case XMLStreamConstants.CDATA: + start = reader.getTextStart(); + length = reader.getTextLength(); + buffer.append(reader.getTextCharacters(), + start, + length); + break; + + case (XMLStreamConstants.END_ELEMENT): + if (reader.getName().equals(endElement)) { + keepGoing = false; + } else if ("p".equals(reader.getName().getLocalPart())) { + xhtml.element("p", buffer.toString()); + buffer.setLength(0); + } + break; + } + } + String remainder = buffer.toString(); + if (remainder.trim().length() > 0) { + xhtml.element("p", remainder); + } + } + + + private String scrapeTextUntil(XMLStreamReader reader, QName endElement) throws XMLStreamException { + StringBuilder buffer = new StringBuilder(); + boolean keepGoing = true; + while (reader.hasNext() && keepGoing) { + switch (reader.next()) { + case XMLStreamConstants.START_ELEMENT: + break; + case XMLStreamConstants.CHARACTERS: + int start = reader.getTextStart(); + int length = reader.getTextLength(); + buffer.append(reader.getTextCharacters(), + start, + length); + break; + + case XMLStreamConstants.CDATA: + start = reader.getTextStart(); + length = reader.getTextLength(); + buffer.append(reader.getTextCharacters(), + start, + length); + break; + + case (XMLStreamConstants.END_ELEMENT): + if (reader.getName().equals(endElement)) { + keepGoing = false; + } else if ("p".equals(reader.getName().getLocalPart())) { + buffer.append("\n"); + } + break; + } + } + return buffer.toString(); + } + + private void loadData(XMLStreamReader reader, Map<String, String> pdfObjRToValues) + throws XMLStreamException { + //reader is at the "xfa:data" element + while (reader.hasNext()) { + switch (reader.next()) { + case (XMLStreamConstants.START_ELEMENT) : + if ("topmostSubform".equals(reader.getLocalName())) { + continue; + } + String value = scrapeTextUntil(reader, reader.getName()); + pdfObjRToValues.put(reader.getLocalName(), value); + break; + case (XMLStreamConstants.END_ELEMENT) : + if (XFA_DATA.equals(reader.getName())) { + return; + } + break; + + } + } + } + + private void handleField(XMLStreamReader reader, Map<String, XFAField> fields) throws XMLStreamException { + //reader is set to the field element + String fieldName = findFirstAttributeValue(reader, "name"); + String pdfObjRef = ""; + String toolTip = ""; + while (reader.hasNext()) { + switch (reader.next()) { + case XMLStreamConstants.START_ELEMENT : + if ("toolTip".equals(reader.getName().getLocalPart())) { + toolTip = scrapeTextUntil(reader, reader.getName()); + } + // add checkbutton, etcif (reader.getName().equals()) + break; + case XMLStreamConstants.END_ELEMENT : + if (xfaTemplateMatcher.reset(reader.getName().getNamespaceURI()).find() && + FIELD_LN.equals(reader.getName().getLocalPart())) { + if (fieldName != null) { + fields.put(fieldName, new XFAField(fieldName, toolTip, pdfObjRef)); + } + return; + } + break; + case XMLStreamConstants.PROCESSING_INSTRUCTION: + if ("PDF_OBJR".equals(reader.getPITarget())) { + pdfObjRef = reader.getPIData(); + } + break; + + } + } + } + + private String findFirstAttributeValue(XMLStreamReader reader, String name) { + for (int i = 0; i < reader.getAttributeCount(); i++) { + String n = reader.getAttributeLocalName(i); + if (name.equals(n)) { + return reader.getAttributeValue(i); + } + } + return ""; + } + + class XFAField { + String fieldName; + String toolTip; + String pdfObjRef; + String value; + + public XFAField(String fieldName, String toolTip, String pdfObjRef) { + this.fieldName = fieldName; + this.toolTip = toolTip; + this.pdfObjRef = pdfObjRef; + } + + @Override + public String toString() { + return "XFAField{" + + "fieldName='" + fieldName + '\'' + + ", toolTip='" + toolTip + '\'' + + ", pdfObjRef='" + pdfObjRef + '\'' + + ", value='" + value + '\'' + + '}'; + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/dbefe983/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties index 1585f2d..bcfe1c6 100644 --- a/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties +++ b/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -enableAutospace true +enableAutoSpace true extractAnnotationText true sortByPosition false suppressDuplicateOverlappingText false @@ -23,3 +23,4 @@ extractInlineImages false extractUniqueInlineImagesOnly true checkExtractAccessPermission false allowExtractionForAccessibility true +ifXFAExtractOnlyXFA false \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/dbefe983/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 581faaa..04d9f2b 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -24,13 +24,13 @@ import static org.junit.Assert.assertTrue; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; - import org.apache.commons.io.IOUtils; import org.apache.log4j.Level; import org.apache.log4j.Logger; @@ -1328,6 +1328,36 @@ public class PDFParserTest extends TikaTest { assertEquals("Microsoft", r.metadata.get(TikaCoreProperties.TITLE)); } + @Test + public void testXFAExtractionBasic() throws Exception { + XMLResult r = getXML("testPDF_XFA_govdocs1_258578.pdf"); + //contains content existing only in the "regular" pdf + assertContains("Mount Rushmore National Memorial", r.xml); + //contains xfa fields and data + assertContains("<li fieldName=\"School_Name\">School Name: my_school</li>", + r.xml); + } + + @Test + public void testXFAOnly() throws Exception { + ParseContext context = new ParseContext(); + + PDFParserConfig config = new PDFParserConfig(); + config.setIfXFAExtractOnlyXFA(true); + context.set(PDFParserConfig.class, config); + ContentHandler handler = new ToXMLContentHandler(StandardCharsets.UTF_8.name()); + Metadata metadata = new Metadata(); + Parser parser = new AutoDetectParser(); + try (InputStream is = getResourceAsStream("/test-documents/testPDF_XFA_govdocs1_258578.pdf")) { + parser.parse(is, handler, metadata, context); + } + String xml = handler.toString(); + assertContains("<li fieldName=\"Room_1\">Room [1]: my_room1</li>", xml); + assertContains("</xfa_content></body></html>", xml); + + assertNotContained("Mount Rushmore National Memorial", xml); + } + private void assertException(String path, Parser parser, ParseContext context, Class expected) { boolean noEx = false; InputStream is = getResourceAsStream(path); http://git-wip-us.apache.org/repos/asf/tika/blob/dbefe983/tika-parsers/src/test/resources/test-documents/testPDF_XFA_govdocs1_258578.pdf ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_XFA_govdocs1_258578.pdf b/tika-parsers/src/test/resources/test-documents/testPDF_XFA_govdocs1_258578.pdf new file mode 100644 index 0000000..e3fb803 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPDF_XFA_govdocs1_258578.pdf differ
