This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 7866ce3 TIKA-2965 -- add metadata flags for XFA/XMP and AcroForm fields
7866ce3 is described below
commit 7866ce3cc182a02872f504640db22af4fcb3691a
Author: tallison <[email protected]>
AuthorDate: Thu Oct 17 12:39:32 2019 -0400
TIKA-2965 -- add metadata flags for XFA/XMP and AcroForm fields
---
.../main/java/org/apache/tika/metadata/PDF.java | 15 +++++++++
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 1 +
.../java/org/apache/tika/parser/pdf/PDFParser.java | 39 +++++++++-------------
.../tika/parser/pdf/PDMetadataExtractor.java | 5 +++
.../org/apache/tika/parser/pdf/PDFParserTest.java | 21 +++++++++---
5 files changed, 53 insertions(+), 28 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index 054b7e0..d9a6213 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -75,4 +75,19 @@ public interface PDF {
Property UNMAPPED_UNICODE_CHARS_PER_PAGE =
Property.internalIntegerSequence(PDF_PREFIX+"unmappedUnicodeCharsPerPage");
+
+ /**
+ * Has XFA
+ */
+ Property HAS_XFA = Property.internalBoolean(PDF_PREFIX+"hasXFA");
+
+ /**
+ * Has XMP, whether or not it is valid
+ */
+ Property HAS_XMP = Property.internalBoolean(PDF_PREFIX+"hasXMP");
+
+ /**
+ * Has > 0 AcroForm fields
+ */
+ Property HAS_ACROFORM_FIELDS = Property.internalBoolean(PDF_PREFIX+"hasAcroFormFields");
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 27e4df8..f15c71b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -759,6 +759,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
//if there is, process it
if (nonNull > 0) {
+ metadata.add(TikaCoreProperties.HAS_SIGNATURE, "true");
xhtml.startElement("li", parentAttributes);
AttributesImpl attrs = new AttributesImpl();
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 38c367d..203cf12 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -29,22 +29,13 @@ import java.util.Map;
import java.util.Set;
import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.jempbox.xmp.XMPMetadata;
-import org.apache.jempbox.xmp.XMPSchema;
-import org.apache.jempbox.xmp.XMPSchemaDublinCore;
-import org.apache.jempbox.xmp.pdfa.XMPSchemaPDFAId;
-import org.apache.pdfbox.cos.COSArray;
-import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
-import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
-import org.apache.poi.util.IOUtils;
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
@@ -53,7 +44,6 @@ import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.AccessPermissions;
import org.apache.tika.metadata.Metadata;
@@ -61,17 +51,13 @@ import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.PDF;
import org.apache.tika.metadata.PagedText;
-import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
-import org.apache.tika.parser.image.xmp.JempboxExtractor;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.tika.utils.XMLReaderUtils;
-import org.w3c.dom.Document;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -160,7 +146,9 @@ public class PDFParser extends AbstractParser implements Initializable {
AccessChecker checker = localConfig.getAccessChecker();
checker.check(metadata);
if (handler != null) {
- if (shouldHandleXFAOnly(pdfDocument, localConfig)) {
+ boolean hasXFA = hasXFA(pdfDocument);
+ metadata.set(PDF.HAS_XFA, Boolean.toString(hasXFA));
+ if (shouldHandleXFAOnly(hasXFA, localConfig)) {
handleXFAOnly(pdfDocument, handler, metadata, context);
} else if
(localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
metadata.add("X-Parsed-By",
TesseractOCRParser.class.toString());
@@ -229,7 +217,11 @@ public class PDFParser extends AbstractParser implements Initializable {
if (document.getDocumentCatalog().getLanguage() != null) {
metadata.set(TikaCoreProperties.LANGUAGE,
document.getDocumentCatalog().getLanguage());
}
-
+ if (document.getDocumentCatalog().getAcroForm() != null &&
+ document.getDocumentCatalog().getAcroForm().getFields() != null &&
+ document.getDocumentCatalog().getAcroForm().getFields().size() > 0) {
+ metadata.set(PDF.HAS_ACROFORM_FIELDS, "true");
+ }
PDMetadataExtractor.extract(document.getDocumentCatalog().getMetadata(),
metadata, context);
PDDocumentInformation info = document.getDocumentInformation();
@@ -315,15 +307,14 @@ public class PDFParser extends AbstractParser implements Initializable {
}
+ private boolean hasXFA(PDDocument pdDocument) {
+ return pdDocument.getDocumentCatalog() != null &&
+ pdDocument.getDocumentCatalog().getAcroForm() != null &&
+ pdDocument.getDocumentCatalog().getAcroForm().hasXFA();
+ }
- private boolean shouldHandleXFAOnly(PDDocument pdDocument, PDFParserConfig config) {
- if (config.getIfXFAExtractOnlyXFA() &&
- pdDocument.getDocumentCatalog() != null &&
- pdDocument.getDocumentCatalog().getAcroForm() != null &&
- pdDocument.getDocumentCatalog().getAcroForm().getXFA() != null) {
- return true;
- }
- return false;
+ private boolean shouldHandleXFAOnly(boolean hasXFA, PDFParserConfig config) {
+ return config.getIfXFAExtractOnlyXFA() && hasXFA;
}
private void handleXFAOnly(PDDocument pdDocument, ContentHandler handler,
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
index 984a38b..374471b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
@@ -52,8 +52,12 @@ class PDMetadataExtractor {
static void extract(PDMetadata pdMetadata, Metadata metadata, ParseContext
context) {
if (pdMetadata == null) {
+ metadata.set(PDF.HAS_XMP, "false");
return;
}
+ //this file has XMP...
+ //whether or not it is readable or throws an exception is another story...
+ metadata.set(PDF.HAS_XMP, "true");
//now go for the XMP
Document dom = loadDOM(pdMetadata, metadata, context);
@@ -225,6 +229,7 @@ class PDMetadataExtractor {
if (pdMetadata == null) {
return null;
}
+
InputStream is = null;
try {
try {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 5eab912..15aec20 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -24,7 +24,6 @@ import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
-import java.io.File;
import java.io.InputStream;
import java.util.Arrays;
import java.util.HashMap;
@@ -614,13 +613,17 @@ public class PDFParserTest extends TikaTest {
//The current test doc does not contain any content in the signature area.
//This just tests that a RuntimeException is not thrown.
//TODO: find a better test file for this issue.
- String xml = getXML("/testPDF_acroform3.pdf").xml;
- assertTrue("found", (xml.contains("<li>aTextField: TIKA-1226</li>")));
+ XMLResult result = getXML("testPDF_acroform3.pdf");
+ Metadata m = result.metadata;
+ assertEquals("true", m.get(PDF.HAS_XMP));
+ assertEquals("true", m.get(PDF.HAS_ACROFORM_FIELDS));
+ assertEquals("false", m.get(PDF.HAS_XFA));
+ assertTrue("found", (result.xml.contains("<li>aTextField: TIKA-1226</li>")));
}
@Test // TIKA-1228, TIKA-1268
public void testEmbeddedFilesInChildren() throws Exception {
- String xml = getXML("/testPDF_childAttachments.pdf").xml;
+ String xml = getXML("testPDF_childAttachments.pdf").xml;
//"regressiveness" exists only in Unit10.doc not in the container pdf document
assertTrue(xml.contains("regressiveness"));
@@ -1083,6 +1086,12 @@ public class PDFParserTest extends TikaTest {
}
@Test
+ public void testNoXMP() throws Exception {
+ assertEquals("false",
+ getXML("testPDF.pdf").metadata.get(PDF.HAS_XMP));
+ }
+
+ @Test
public void testPDFEncodedStringsInXMP() throws Exception {
//TIKA-1678
XMLResult r = getXML("testPDF_PDFEncodedStringInXMP.pdf");
@@ -1092,6 +1101,10 @@ public class PDFParserTest extends TikaTest {
@Test
public void testXFAExtractionBasic() throws Exception {
XMLResult r = getXML("testPDF_XFA_govdocs1_258578.pdf");
+ Metadata m = r.metadata;
+ assertEquals("true", m.get(PDF.HAS_XFA));
+ assertEquals("true", m.get(PDF.HAS_ACROFORM_FIELDS));
+ assertEquals("true", m.get(PDF.HAS_XMP));
//contains content existing only in the "regular" pdf
assertContains("Mount Rushmore National Memorial", r.xml);
//contains xfa fields and data